From ebe5e8d264d5084096ddece7750744a854369d2b Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Tue, 26 Sep 2017 12:37:35 -0400 Subject: [PATCH 01/38] [egs] Bug fix in train_raw_dnn.py --- egs/wsj/s5/steps/nnet3/train_raw_dnn.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py index 030be1ad8b8..38396f0b4e7 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py @@ -303,19 +303,19 @@ def train(args, run_opts): else: models_to_combine = None - if os.path.exists('{0}/valid_diagnostic.scp'.format(args.egs_dir)): - if os.path.exists('{0}/valid_diagnostic.egs'.format(args.egs_dir)): + if os.path.exists('{0}/valid_diagnostic.scp'.format(egs_dir)): + if os.path.exists('{0}/valid_diagnostic.egs'.format(egs_dir)): raise Exception('both {0}/valid_diagnostic.egs and ' '{0}/valid_diagnostic.scp exist.' 'This script expects only one of them to exist.' - ''.format(args.egs_dir)) + ''.format(egs_dir)) use_multitask_egs = True else: - if not os.path.exists('{0}/valid_diagnostic.egs'.format(args.egs_dir)): + if not os.path.exists('{0}/valid_diagnostic.egs'.format(egs_dir)): raise Exception('neither {0}/valid_diagnostic.egs nor ' '{0}/valid_diagnostic.scp exist.' 'This script expects one of them.' - ''.format(args.egs_dir)) + ''.format(egs_dir)) use_multitask_egs = False logger.info("Training will run for {0} epochs = " From fbedee05b61f479b9e58d2699cc85656ed234a0e Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Wed, 1 Nov 2017 14:59:54 -0400 Subject: [PATCH 02/38] steps/cleanup: Fixed corner case in resolve_ctm_edits_overlaps.py --- .../cleanup/internal/resolve_ctm_edits_overlaps.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/egs/wsj/s5/steps/cleanup/internal/resolve_ctm_edits_overlaps.py b/egs/wsj/s5/steps/cleanup/internal/resolve_ctm_edits_overlaps.py index be58ccac855..09cc90c4b60 100755 --- a/egs/wsj/s5/steps/cleanup/internal/resolve_ctm_edits_overlaps.py +++ b/egs/wsj/s5/steps/cleanup/internal/resolve_ctm_edits_overlaps.py @@ -155,8 +155,8 @@ def resolve_overlaps(ctm_edits, segments): Returns new lines of CTM for the recording. Arguments: - ctms - The CTM lines for a single recording. This is one value stored - in the dictionary read by read_ctm(). Assumes that the lines + ctm_edits - The CTM lines for a single recording. This is one value + stored in the dictionary read by read_ctm(). Assumes that the lines are sorted by the utterance-ids. The format is the following: [[(utteranceA, channelA, start_time1, duration1, hyp_word1, conf1), @@ -171,13 +171,12 @@ def resolve_overlaps(ctm_edits, segments): [... (utteranceZ, channelZ, start_timeN, durationN, hyp_wordN, confN)] ] + Expects this to be non-empty. segments - Dictionary containing the output of read_segments() { utterance_id: (recording_id, start_time, end_time) } """ total_ctm_edits = [] - if len(ctm_edits) == 0: - raise RuntimeError('CTMs for recording is empty. 
' - 'Something wrong with the input ctms') + assert len(ctm_edits) > 0 # First column of first line in CTM for first utterance next_utt = ctm_edits[0][0][0] @@ -306,6 +305,11 @@ def run(args): if (reco, utt) in ctm_edits: ctm_edits_for_reco.append(ctm_edits[(reco, utt)]) try: + if len(ctm_edits_for_reco) == 0: + logger.warn('CTMs for recording %s is empty.', + reco) + break # Go to the next recording + # Process CTMs in the recordings ctm_edits_for_reco = resolve_overlaps(ctm_edits_for_reco, segments) write_ctm_edits(ctm_edits_for_reco, args.ctm_edits_out) From f0627cf122112062f4841805528df414aa124a6a Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Fri, 6 Jan 2017 14:26:48 -0500 Subject: [PATCH 03/38] bn: Adding BN recipe --- egs/bn/s5/README | 6 + egs/bn/s5/cmd.sh | 14 + egs/bn/s5/conf/merge_vad_map.txt | 16 + egs/bn/s5/conf/mfcc.conf | 6 + egs/bn/s5/conf/vad.conf | 2 + .../local/data_prep/csr_hub4_utils/INVENTORY | 56 + .../s5/local/data_prep/csr_hub4_utils/README | 34 + .../local/data_prep/csr_hub4_utils/abbrlist | 2403 +++++++++++++++++ .../data_prep/csr_hub4_utils/abbrproc.perl | 465 ++++ .../data_prep/csr_hub4_utils/addressforms | 38 + .../data_prep/csr_hub4_utils/artfilter.perl | 83 + .../data_prep/csr_hub4_utils/bugproc.perl | 69 + .../s5/local/data_prep/csr_hub4_utils/do-lm | 43 + .../csr_hub4_utils/eval-material.ptrns | 4 + .../local/data_prep/csr_hub4_utils/num_excp | 528 ++++ .../data_prep/csr_hub4_utils/numhack.perl | 80 + .../data_prep/csr_hub4_utils/numproc.perl | 1134 ++++++++ .../data_prep/csr_hub4_utils/pare-sgml.perl | 36 + .../csr_hub4_utils/process_filelist.py | 164 ++ .../csr_hub4_utils/process_filelist.sh | 30 + .../data_prep/csr_hub4_utils/progsummary.perl | 44 + .../data_prep/csr_hub4_utils/puncproc.perl | 196 ++ .../data_prep/csr_hub4_utils/sent-init.vocab | 411 +++ .../local/data_prep/csr_hub4_utils/sentag.c | 674 +++++ .../csr_hub4_utils/tr-bn-char.fast.perl | 13 + .../csr_hub4_utils/tr-bn-char.slow.perl | 46 + egs/bn/s5/local/data_prep/do-lm-csr96 | 40 + .../data_prep/prepare_1996_csr_hub4_corpus.sh | 51 + .../prepare_1998_hub4_bn_eng_eval.sh | 87 + egs/bn/s5/local/data_prep/prepare_bn_data.py | 208 ++ .../data_prep/prepare_na_news_text_corpus.sh | 51 + .../local/data_prep/process_na_news_text.py | 91 + egs/bn/s5/local/dict | 1 + egs/bn/s5/local/format_data.sh | 28 + egs/bn/s5/local/format_lms.sh | 47 + egs/bn/s5/local/lm/merge_word_counts.py | 30 + egs/bn/s5/local/lm/text_normalization.py | 42 + egs/bn/s5/local/normalize_transcripts.pl | 47 + egs/bn/s5/local/prepare_dict.sh | 191 ++ egs/bn/s5/local/run_cleanup_segmentation.sh | 93 + egs/bn/s5/local/score.sh | 1 + egs/bn/s5/local/score_sclite.sh | 94 + egs/bn/s5/local/train_lm.sh | 149 + egs/bn/s5/path.sh | 6 + egs/bn/s5/run.sh | 90 + egs/bn/s5/steps | 1 + egs/bn/s5/utils | 1 + 47 files changed, 7944 insertions(+) create mode 100644 egs/bn/s5/README create mode 100755 egs/bn/s5/cmd.sh create mode 100644 egs/bn/s5/conf/merge_vad_map.txt create mode 100644 egs/bn/s5/conf/mfcc.conf create mode 100644 egs/bn/s5/conf/vad.conf create mode 100644 egs/bn/s5/local/data_prep/csr_hub4_utils/INVENTORY create mode 100644 egs/bn/s5/local/data_prep/csr_hub4_utils/README create mode 100644 egs/bn/s5/local/data_prep/csr_hub4_utils/abbrlist create mode 100755 egs/bn/s5/local/data_prep/csr_hub4_utils/abbrproc.perl create mode 100644 egs/bn/s5/local/data_prep/csr_hub4_utils/addressforms create mode 100755 egs/bn/s5/local/data_prep/csr_hub4_utils/artfilter.perl create mode 100755 
egs/bn/s5/local/data_prep/csr_hub4_utils/bugproc.perl create mode 100755 egs/bn/s5/local/data_prep/csr_hub4_utils/do-lm create mode 100644 egs/bn/s5/local/data_prep/csr_hub4_utils/eval-material.ptrns create mode 100644 egs/bn/s5/local/data_prep/csr_hub4_utils/num_excp create mode 100755 egs/bn/s5/local/data_prep/csr_hub4_utils/numhack.perl create mode 100755 egs/bn/s5/local/data_prep/csr_hub4_utils/numproc.perl create mode 100755 egs/bn/s5/local/data_prep/csr_hub4_utils/pare-sgml.perl create mode 100755 egs/bn/s5/local/data_prep/csr_hub4_utils/process_filelist.py create mode 100755 egs/bn/s5/local/data_prep/csr_hub4_utils/process_filelist.sh create mode 100755 egs/bn/s5/local/data_prep/csr_hub4_utils/progsummary.perl create mode 100755 egs/bn/s5/local/data_prep/csr_hub4_utils/puncproc.perl create mode 100644 egs/bn/s5/local/data_prep/csr_hub4_utils/sent-init.vocab create mode 100644 egs/bn/s5/local/data_prep/csr_hub4_utils/sentag.c create mode 100755 egs/bn/s5/local/data_prep/csr_hub4_utils/tr-bn-char.fast.perl create mode 100755 egs/bn/s5/local/data_prep/csr_hub4_utils/tr-bn-char.slow.perl create mode 100755 egs/bn/s5/local/data_prep/do-lm-csr96 create mode 100755 egs/bn/s5/local/data_prep/prepare_1996_csr_hub4_corpus.sh create mode 100755 egs/bn/s5/local/data_prep/prepare_1998_hub4_bn_eng_eval.sh create mode 100755 egs/bn/s5/local/data_prep/prepare_bn_data.py create mode 100755 egs/bn/s5/local/data_prep/prepare_na_news_text_corpus.sh create mode 100755 egs/bn/s5/local/data_prep/process_na_news_text.py create mode 120000 egs/bn/s5/local/dict create mode 100755 egs/bn/s5/local/format_data.sh create mode 100755 egs/bn/s5/local/format_lms.sh create mode 100755 egs/bn/s5/local/lm/merge_word_counts.py create mode 100644 egs/bn/s5/local/lm/text_normalization.py create mode 100755 egs/bn/s5/local/normalize_transcripts.pl create mode 100755 egs/bn/s5/local/prepare_dict.sh create mode 100755 egs/bn/s5/local/run_cleanup_segmentation.sh create mode 120000 egs/bn/s5/local/score.sh create mode 100755 egs/bn/s5/local/score_sclite.sh create mode 100755 egs/bn/s5/local/train_lm.sh create mode 100755 egs/bn/s5/path.sh create mode 100755 egs/bn/s5/run.sh create mode 120000 egs/bn/s5/steps create mode 120000 egs/bn/s5/utils diff --git a/egs/bn/s5/README b/egs/bn/s5/README new file mode 100644 index 00000000000..8a8ae65108d --- /dev/null +++ b/egs/bn/s5/README @@ -0,0 +1,6 @@ + The MUSAN corpus is required for system training. It is available at: + http://www.openslr.org/17/ + + The test requires Broadcast News data. The LDC Catalog numbers are: + Speech LDC97S44 + Transcripts LDC97T22 diff --git a/egs/bn/s5/cmd.sh b/egs/bn/s5/cmd.sh new file mode 100755 index 00000000000..43f7b21771a --- /dev/null +++ b/egs/bn/s5/cmd.sh @@ -0,0 +1,14 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. 
Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 1G" +export decode_cmd="queue.pl --mem 4G" diff --git a/egs/bn/s5/conf/merge_vad_map.txt b/egs/bn/s5/conf/merge_vad_map.txt new file mode 100644 index 00000000000..216dee78b65 --- /dev/null +++ b/egs/bn/s5/conf/merge_vad_map.txt @@ -0,0 +1,16 @@ +# This table defines the mapping used by the binary merge-vads to +# combine the output of compute-vad and compute-vad-from-frame-likes. +# The first column corresponds to VAD decisions from compute-vad +# and the second corresponds to VAD decisions from +# compute-vad-from-frame-likes. The labels "0" and "1" in the +# first column represent (approximately) silence and nonsilence +# respectively. The labels "0," "1," and "2" in the second column +# represent noise, speech, and music, respectively. The third +# column lists the resulting output labels: "0," "1," and "2" +# corresponding to silence/noise, speech, and music. +0 0 0 +1 0 0 +0 1 0 +1 1 1 +0 2 0 +1 2 2 diff --git a/egs/bn/s5/conf/mfcc.conf b/egs/bn/s5/conf/mfcc.conf new file mode 100644 index 00000000000..a4be40be454 --- /dev/null +++ b/egs/bn/s5/conf/mfcc.conf @@ -0,0 +1,6 @@ +--sample-frequency=16000 +--frame-length=25 # the default is 25 +--low-freq=20 # the default. +--high-freq=3700 # the default is zero meaning use the Nyquist (4k in this case). +--num-ceps=20 # higher than the default which is 12. +--snip-edges=false diff --git a/egs/bn/s5/conf/vad.conf b/egs/bn/s5/conf/vad.conf new file mode 100644 index 00000000000..a0ca2449b10 --- /dev/null +++ b/egs/bn/s5/conf/vad.conf @@ -0,0 +1,2 @@ +--vad-energy-threshold=5.5 +--vad-energy-mean-scale=0.5 diff --git a/egs/bn/s5/local/data_prep/csr_hub4_utils/INVENTORY b/egs/bn/s5/local/data_prep/csr_hub4_utils/INVENTORY new file mode 100644 index 00000000000..73229812231 --- /dev/null +++ b/egs/bn/s5/local/data_prep/csr_hub4_utils/INVENTORY @@ -0,0 +1,56 @@ +INVENTORY + This file, a short description of included tools. +README + Introduction to the tools. +abbrlist +abbrproc.perl + Part of LM conditioning pipeline. + Spells out abbreviations and such. + "abbrlist" is an auxiliary data file for abbrproc. +addressforms + auxiliary file used by "sentag.c" +artfilter.perl + Selects articles based on content of SGML tags. +bugproc.perl + Part of LM conditioning pipeline. + Corrects a few common typos and non-standard spellings. +do-lm + Bourne shell script that executes language modeling conditioning + pipeline. +eval-material.ptrns + Pattern file used to separate reserved "test" (evaluation) articles + from "train" articles (training material). Used with "artfilter" + program along the following lines: + foreach $file + artfilter.perl -t program -f eval-material.ptrns -v -r \ + $file.test $file > $file.train +num_excp +numhack.perl +numproc.perl + Part of LM conditioning pipeline. + Spells out numberical expressions. + "num_excp" is an auxiliary data file for numproc. + "numhack.perl" is a new module for phone numbers and zip codes. +pare-sgml.perl + Part of LM conditioning pipeline. + Removes extraneous SGML tagging and transcriber comments enclosed + in brackets. +progsummary.perl + extracts program information from sgml-ized PSM texts +puncproc.perl + Part of LM conditioning pipeline. + Verbalizes punctuation (or removes, with -np switch). +sent-init.vocab +sentag.c + Program used to tag sentences in "raw" version. 
+ Revised since last CSR_LM95 to handle over-long + sentences/paragraphs and to pass material lacking any obvious + end-of-sentence markers or alphabetic characters, since the + transcriptions are more likely to contain such text. Uses + auxiliary "sent-init.vocab" file. +tr-bn-char.fast.perl +tr-bn-char.slow.perl + Program used to translate 8-bit character encoding occasionally + found in the documents. The two versions should be identical in + output; the "slow" version is more readable while the "fast" + version is more efficient. diff --git a/egs/bn/s5/local/data_prep/csr_hub4_utils/README b/egs/bn/s5/local/data_prep/csr_hub4_utils/README new file mode 100644 index 00000000000..fa73f3a4dc3 --- /dev/null +++ b/egs/bn/s5/local/data_prep/csr_hub4_utils/README @@ -0,0 +1,34 @@ +SOFTWARE ACCOMPANYING CSR LM DATA +--------------------------------- + +The files in this directory fall into three categories: + +(1) C source code (*.c) + +(2) Perl source code (*.perl) + +(3) shell scripts and auxiliary data files + +In general, program summaries are provided within the source files, and are +often printed on stderr by the programs when the user enters some +unsuitable command line option (e.g. -h). The "do-lm" shell script shows +the components that were used in the LM-conditioning pipeline; other perl +programs were used for data summaries and correction of minor glitches. +See INVENTORY for more information. + +For further information on these programs, please contact Robert MacIntyre +or David Graff at the Linguistic Data Consortium: + robertm@ldc.upenn.edu, (215) 573-5491 + graff@ldc.upenn.edu, (215) 898-0887 + +While disclaimers have not been systematically placed in all source +code files, users are expected to understand that the following +applies to all source code files in this directory, unless otherwise +noted in particular files: + +This software is being provided by the Linguistic Data Consortium, and +the University of Pennsylvania, without any guarantee, warrantee or +implication about its correctness, usefulness or suitability to any +purpose. You may copy, modify and redistribute it, but you may not +hold the LDC or Univ. of Penn. responsible for any damages resulting +from its use. diff --git a/egs/bn/s5/local/data_prep/csr_hub4_utils/abbrlist b/egs/bn/s5/local/data_prep/csr_hub4_utils/abbrlist new file mode 100644 index 00000000000..0c15bbd2eb5 --- /dev/null +++ b/egs/bn/s5/local/data_prep/csr_hub4_utils/abbrlist @@ -0,0 +1,2403 @@ +############################################################################### +# This software is being provided to you, the LICENSEE, by the Massachusetts # +# Institute of Technology (M.I.T.) under the following license. By # +# obtaining, using and/or copying this software, you agree that you have # +# read, understood, and will comply with these terms and conditions: # +# # +# Permission to use, copy, modify and distribute, including the right to # +# grant others the right to distribute at any tier, this software and its # +# documentation for any purpose and without fee or royalty is hereby granted, # +# provided that you agree to comply with the following copyright notice and # +# statements, including the disclaimer, and that the same appear on ALL # +# copies of the software and documentation, including modifications that you # +# make for internal use or for distribution: # +# # +# Copyright 1991-4 by the Massachusetts Institute of Technology. All rights # +# reserved. # +# # +# THIS SOFTWARE IS PROVIDED "AS IS", AND M.I.T. 
MAKES NO REPRESENTATIONS OR # +# WARRANTIES, EXPRESS OR IMPLIED. By way of example, but not limitation, # +# M.I.T. MAKES NO REPRESENTATIONS OR WARRANTIES OF MERCHANTABILITY OR FITNESS # +# FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR # +# DOCUMENTATION WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, # +# TRADEMARKS OR OTHER RIGHTS. # +# # +# The name of the Massachusetts Institute of Technology or M.I.T. may NOT be # +# used in advertising or publicity pertaining to distribution of the # +# software. Title to copyright in this software and any associated # +# documentation shall at all times remain with M.I.T., and USER agrees to # +# preserve same. # +############################################################################### + +# abbreviation list +# derived from unigram file 29 Aug 91 mods to 17 Sept 91 +# x.y. mapped to x. y. in program + +# true abbreviations (must end with .) +# if key includes lower case, an upper case version will be created +Adm. Admiral +Ala. Alabama +Alex. Alexander +Apr. April +Ariz. Arizona +Ark. Arkansas +AUG. AUGUST +Aug. August +Ave. Avenue +Bancorp. Bancorp +Bhd. B. H. D. +Blvd. Boulevard +Brig. Brigadeer +Bros. Brothers +Cal. Calorie +Ca. California +Calif. California +Capt. Captain +Cie. Company +Cmdr. Commander +Co. Company +co. Company +Col. Colonel +Colo. Colorado +Conn. Connecticut +Corp. Corporation +Cos. Companies +Cpl. Corporal +Dec. December +Del. Delaware +Dept. Department +Dr. Doctor +Drs. Doctors +Feb. February +Fla. Florida +Fr. Friar +Fri. Friday +Ft. Fort +Ga. Georgia +Gen. General +Gov. Governor +Ill. Illinois +Inc. Incorporated +Ind. Indiana +InfoCorp. InfoCorp +Infocorp. InfoCorp +Intercorp. Intercorp +Jan. January +Jr. Junior +Jul. July +Jun. June +Kan. Kansas +Ky. Kentucky +La. Louisiana +lb. pound +lbs. pounds +Lt. Lieutenant +Ltd. Limited +Ltda. Company +Maj. Major +Mar. March +Mass. Massachusetts +MCorp. M. Corporation +Md. Maryland +Me. Maine +# Some Italian company +Me.T.A. M. E. T. A. +Mfg. Manufacturing +Mich. Michigan +Minn. Minnesota +Miss. Mississippi +Mo. Missouri +Mt. Mountain +Mont. Montana +# meaning of mistress has changed + symmetry +#Mr. Mister +#Mrs. Mistress +#Ms. Miz +#Messrs. +# +Neb. Nebraska +Nev. Nevada +No. Number +Nos. Numbers +Nov. November +Oct. October +Okla. Oklahoma +Ont. Ontario +Op. Opus +Ore. Oregon +Pa. Pennsylvania +PacifiCorp. PacifiCorp +Penn. Pennsylvania +PHLCorp. P. H. L. Corporation +Ph.D. P. H. D. +PhD. P. H. D. +Prof. Professor +Prop. Proposition +Pte. Point +Pty. Party +Pvt. Private +Rep. Representative +Reps. Representatives +Rev. Reverend +Sen. Senator +Sens. Senators +Sept. September +Sgt. Sargent +S.p.A. Company +Sr. Senior +#St. Street or Saint Context dependent (in abbrevproc) +Ste. Saint +Tel. Telephone +Tenn. Tennessee +Tex. Texas +Va. Virginia +Vt. Vermont +W.Va. West Virginia +Wash. Washington +Wis. Wisconsin +Wyo. Wyoming +Yr. Year +etc. et-cetera +Etc. Et-cetera +ft. feet +inc. incorporated +mfg. manufacturing +vs. 
versus + +# left contexts for roman cardinal numerals +# case independent comparisons +*r Act +*r Advantage +*r amendment +*r angiotensin +*r Antrim +*r Appendix +*r Apple +*r Arrow +*r Article +*r Associates +*r Astros +*r Bank +*r Bowl +*r Bronco +*r Busch +*r CSPAN +*r Canada +*r Century +*r Class +*r Cleopatra +*r Concepts +*r Cop +*r dBase +*r database +*r Delta +*r Detente +*r Dundee +*r Esprit +*r Explorer +*r Express +*r Eyes +*r Factor +*r Ford +*r Freaks +*r Fund +*r Funding +*r Funds +*r Future +*r GOD +*r GSTAR +*r Gemini +*r Ghostbusters +*r Global +*r Group +*r Gulfstream +*r Hybrid +*r Intelsat +*r Investment +*r Investments +*r Iron +*r Jets +*r Journalism +*r Kong +# LaSalle nuclear plant +*r LaSalle +*r LaserWriter +*r Lighthouse +*r Linen +*r Mark +*r Mac +*r MacDraw +*r MacProject +*r Macintosh +*r Management +*r Mark +*r Metro +*r MicroVAX +*r Minuteman +*r Monopoly +*r Notes +*r numeral +*r OPEC +*r Officer +*r Overseas +*r Part +*r Partners +*r Pershing +*r Phantasm +*r Phase +*r Phobos +*r Pioneer +*r Pirate +*r Play +*r Plus +*r Point +*r Portable +*r Quick +*r Rambo +*r Ransom +*r Resorts +*r SALT +*r Screen +*r Series +*r Stage +*r Superman +*r System +*r TIAA +*r Titan +*r Title +*r Toxic +*r Trac +*r Trek +*r Trident +*r Trooper +*r Trust +*r Ultima +*r Vatican +*r Ventures +*r Volume +*r WW +*r War +*r Weapon +*r Wespac +*r Westar +*r Wrestlemania + +# Roman ordinals (I, V, and X not included: too many false hits) +II the second +III the third +IV the fourth +VI the sixth +VII the seventh +VIII the eighth +IX the ninth +XI the eleventh +XII the twelfth +XIII the thirteenth +XIV the fourteenth +XV the fifteenth +XVI the sixteenth +XVII the seventeenth +XVIII the eighteenth +XIX the nineteenth +XX the twentieth +XXI the twenty-first +XXII the twenty-second +XXIII the twenty-third +XXIV the twenty-fourth +XXV the twenty-fifth + +# acronyms (not ending in .) needing translation +# if key includes lower case, an upper case version will be created +# keys can include - / & . +AA Double A. +AAA Triple A. +AAI A. A. I. +AAP A. A. P. +AAR A. A. R. +AARP A. A. R. P. +AAS A. A. S. +AB A. B. +ABA A. B. A. +ABB A. B. B. +ABC A. B. C. +ABD A. B. D. +ABF A. B. F. +ABI A. B. I. +ABM A. B. M. +ABN A. B. N. +ABS A. B. S. +ABT A. B. T. +AC A. C. +ACA A. C. A. +ACC A. C. C. +ACCT A. C. C. T. +ACEC A. C. E. C. +ACF A. C. F. +ACI A. C. I. +ACLI A. C. L. I. +ACLU A. C. L. U. +ACM A. C. M. +ACO A. C. O. +ACP A. C. P. +ACSH A. C. S. H. +ACTV A. C. T. V. +ADB A. D. B. +ADC A. D. C. +ADI A. D. I. +ADIA A. D. I. A. +ADM A. D. M. +ADN A. D. N. +ADP A. D. P. +ADR A. D. R. +ADT A. D. T. +ADV A. D. V. +adv A. D. V. +AD&P A. D. & P. +AD/SAT AD / SAT +AE A. E. +AEA A. E. A. +AEC A. E. C. +AEG A. E. G. +AEI A. E. I. +AEL A. E. L. +AEP A. E. P. +AER A. E. R. +AES A. E. S. +AEU A. E. U. +AEW A. E. W. +AFA A. F. A. +AFC A. F. C. +AFCO A. F. C. O. +AFDC A. F. D. C. +AFG A. F. G. +AFGE A. F. G. E. +AFIS A. F. I. S. +AFL A. F. L. +AFP A. F. P. +AFSCME A. F. S. C. M. E. +AG A. G. +AGA A. G. A. +AGB A. G. B. +AGEF A. G. E. F. +AGF A. G. F. +AGI A. G. I. +AGIP A. G. I. P. +AGS A. G. S. +AGT A. G. T. +AHA A. H. A. +AHL A. H. L. +AI A. I. +AIBD A. I. B. D. +AIC A. I. C. +AICPA A. I. C. P. A. +AIFS A. I. F. S. +AIG A. I. G. +AIL A. I. L. +AIME A. I. M. E. +AIT A. I. T. +AIW A. I. W. +AIX A. I. X. +AK A. K. +AKA A. K. A. +ALC A. L. C. +ALQ A. L. Q. +ALR A. L. R. +AM A. M. +AMA A. M. A. +AMC A. M. C. +AMCA A. M. C. A. +AMCC A. M. C. C. +AMD A. M. D. +AME A. M. E. +AMF A. M. F. +AMG A. M. G. +AMI A. M. I. 
+AML A. M. L. +AMO A. M. O. +AMP A. M. P. +AMR A. M. R. +AMT A. M. T. +ANB A. N. B. +ANC A. N. C. +ANF A. N. F. +ANMC A. N. M. C. +ANR A. N. R. +ANWR A. N. W. R. +ANZ A. N. Z. +AO A. O. +AOC A. O. C. +AOI A. O. I. +AOK A. O. K. +AON A. O. N. +AP A. P. +A&P A. & P. +APA A. P. A. +APAC A. P. A. C. +API A. P. I. +APL A. P. L. +APMA A. P. M. A. +APN A. P. N. +APPWP A. P. P. W. P. +APR A. P. R. +APS A. P. S. +APSAC A. P. S. A. C. +APV A. P. V. +APW A. P. W. +ARA A. R. A. +ARB A. R. B. +ARD A. R. D. +ARX A. R. X. +ASA A. S. A. +ASB A. S. B. +ASC A. S. C. +ASEA A. S. E. A. +ASI A. S. I. +ASPCA A. S. P. C. A. +AST A. S. T. +AT A. T. +ATA A. T. A. +ATC A. T. C. +ATF A. T. F. +ATI A. T. I. +ATM A. T. M. +ATN A. T. N. +ATR A. T. R. +ATS A. T. S. +AT&T A. T. & T. +ATV A. T. V. +AUS A. U. S. +AV A. V. +AVAQ A. V. A. Q. +AVC A. V. C. +AVX A. V. X. +AWA A. W. A. +AWD A. W. D. +AWOL A. W. O. L. +AWSJ A. W. S. J. +AWT A. W. T. +AXA A. X. A. +AXP A. X. P. +AY A. Y. +AZL A. Z. L. +AZP A. Z. P. +AZT A. Z. T. +BA B. A. +Ba B. a. +BAA B. A. A. +Baa B. a. a. +BAC B. A. C. +BAII B. A. I. I. +B.A.IT B. A. IT +BASF B. A. S. F. +B.A.T B. A. T. +BB Double B. +BBA B. B. A. +BBB Triple B. +BBC B. B. C. +BBDO B. B. D. O. +BBN B. B. N. +BC B. C. +BCA B. C. A. +BCCI B. C. C. I. +BCE B. C. E. +BCEAO B. C. E. A. O. +BCG B. C. G. +BCI B. C. I. +BCM B. C. M. +BCOA B. C. O. A. +BCS B. C. S. +BCV B. C. V. +BCW B. C. W. +BDC B. D. C. +BDDP B. D. D. P. +BDM B. D. M. +BDO B. D. O. +BDR B. D. R. +BEC B. E. C. +BEI B. E. I. +BF B. F. +BFEA B. F. E. A. +BFS B. F. S. +BGH B. G. H. +BGS B. G. S. +BHC B. H. C. +Bhd B. H. D. +BHF B. H. F. +BHP B. H. P. +BHS B. H. S. +BHW B. H. W. +BI B. I. +BIA B. I. A. +BICC B. I. C. C. +BiiN B. i. i. N. +BIP B. I. P. +BIR B. I. R. +BIS B. I. S. +BIW B. I. W. +BJ B. J. +BJF B. J. F. +BK B. K. +BL B. L. +BLM B. L. M. +BLS B. L. S. +BM B. M. +BMA B. M. A. +BMC B. M. C. +BMI B. M. I. +BMP B. M. P. +BMW B. M. W. +BMY B. M. Y. +BN B. N. +BNL B. N. L. +BNP B. N. P. +BNS B. N. S. +BNY B. N. Y. +BOC B. O. C. +BOJ B. O. J. +BOT B. O. T. +BP B. P. +bpd B. P. D. +BPB B. P. B. +BPC B. P. C. +BPCA B. P. C. A. +BPCC B. P. C. C. +BPD B. P. D. +BPI B. P. I. +BR B. R. +BRE B. R. E. +BRNF B. R. N. F. +BRT B. R. T. +BRZ B. R. Z. +BS B. S. +BSB B. S. B. +BSD B. S. D. +BSE B. S. E. +BSI B. S. I. +BSN B. S. N. +BSO B. S. O. +BST B. S. T. +BT B. T. +BTL B. T. L. +BTR B. T. R. +BTU B. T. U. +BV B. V. +BVI B. V. I. +BVL B. V. L. +BW B. W. +BWA B. W. A. +BWAC B. W. A. C. +BZ B. Z. +BZW B. Z. W. +CA C. A. +Ca C. a. +CAA C. A. A. +Caa C. a. a. +CAAC C. A. A. C. +CAC C. A. C. +CACI C. A. C. I. +CAD C. A. D. +CAE C. A. E. +CAID C. A. I. D. +CAMI C. A. M. I. +CARU C. A. R. U. +CATV C. A. T. V. +CAV C. A. V. +CAW C. A. W. +CB C. B. +CBC C. B. C. +CBI C. B. I. +CBN C. B. N. +CBO C. B. O. +CBOE C. B. O. E. +CBOT C. B. O. T. +CBS C. B. S. +CBT C. B. T. +CBW C. B. W. +CCA C. C. A. +CCC C. C. C. +CCD C. C. D. +CCE C. C. E. +CCH C. C. H. +CCK C. C. K. +CCL C. C. L. +CCX C. C. X. +CD C. D. +CDA C. D. A. +CDC C. D. C. +CDF C. D. F. +CDI C. D. I. +CDL C. D. L. +CDS C. D. S. +CDT C. D. T. +CDU C. D. U. +CDW C. D. W. +CE C. E. +CEA C. E. A. +CED C. E. D. +CEE C. E. E. +CEI C. E. I. +CEL C. E. L. +CEO C. E. O. +CEP C. E. P. +CES C. E. S. +CF C. F. +CFA C. F. A. +CFC C. F. C. +CFM C. F. M. +CFO C. F. O. +CFP C. F. P. +CFS C. F. S. +CFTC C. F. T. C. +CFTR C. F. T. R. +CGB C. G. B. +CGCT C. G. C. T. +CGE C. G. E. +CGM C. G. M. +CGS C. G. S. +CGT C. G. T. +CH C. H. +CHC C. H. C. +CHG C. H. G. +CI C. I. +CIA C. I. A. +CIBC C. I. B. C. +CIC C. I. C. 
+CID C. I. D. +CIE C. I. E. +CIGS C. I. G. S. +CIM C. I. M. +CIO C. I. O. +CIP C. I. P. +CIR C. I. R. +CIS C. I. S. +CIT C. I. T. +CJ C. J. +CJI C. J. I. +CJM C. J. M. +CK C. K. +CL C. L. +CLC C. L. C. +CLS C. L. S. +CLU C. L. U. +CLX C. L. X. +CM C. M. +CMA C. M. A. +CMB C. M. B. +CMC C. M. C. +CME C. M. E. +CMF C. M. F. +CMI C. M. I. +CML C. M. L. +CMO C. M. O. +CMQ C. M. Q. +CMS C. M. S. +CMV C. M. V. +CMS C. M. X. +CN C. N. +CNA C. N. A. +CNB C. N. B. +CNBC C. N. B. C. +CNCL C. N. C. L. +CNCP C. N. C. P. +CNFR C. N. F. R. +CNG C. N. G. +CNN C. N. N. +CNOOC C. N. O. O. C. +CNW C. N. W. +Corp Corporation +CP C. P. +CPA C. P. A. +CPAC C. P. A. C. +CPB C. P. B. +CPC C. P. C. +CPE C. P. E. +CPI C. P. I. +CPL C. P. L. +CPM C. P. M. +CPP C. P. P. +CPR C. P. R. +CPSC C. P. S. C. +CPT C. P. T. +CQ C. Q. +CR C. R. +CRA C. R. A. +CRB C. R. B. +CRC C. R. C. +CRI C. R. I. +CRL C. R. L. +CRS C. R. S. +CRT C. R. T. +CRTC C. R. T. C. +CRX C. R. X. +CS C. S. +CSA C. S. A. +CSB C. S. B. +CSC C. S. C. +CSF C. S. F. +CSFB C. S. F. B. +CSI C. S. I. +CSIS C. S. I. S. +CSK C. S. K. +CSO C. S. O. +CSR C. S. R. +CSS C. S. S. +CST C. S. T. +CSU C. S. U. +CSV C. S. V. +CSX C. S. X. +CT C. T. +CTA C. T. A. +CTB C. T. B. +CTBS C. T. B. S. +CTC C. T. C. +CTG C. T. G. +CTI C. T. I. +CTK C. T. K. +CTM C. T. M. +CTS C. T. S. +CTV C. T. V. +CU C. U. +CUC C. U. C. +CVB C. V. B. +CVG C. V. G. +CVN C. V. N. +CVNY C. V. N. Y. +CVS C. V. S. +CW C. W. +CWA C. W. A. +CWB C. W. B. +CWT C. W. T. +CX C. X. +CXR C. X. R. +DAF D. A. F. +DAP D. A. P. +DAX D. A. X. +DB D. B. +DBA D. B. A. +DBI D. B. I. +DBL D. B. L. +DBS D. B. S. +DC D. C. +DCCC D. C. C. C. +DCI D. C. I. +DCNY D. C. N. Y. +DD D. D. +DDA D. D. A. +DDB D. D. B. +DDC D. D. C. +DDG D. D. G. +DDI D. D. I. +DDR D. D. R. +DDT D. D. T. +DEA D. E. A. +DEC D. E. C. +DES D. E. S. +DFA D. F. A. +DFC D. F. C. +DFMO D. F. M. O. +DFS D. F. S. +DG D. G. +DGA D. G. A. +DGPT D. G. P. T. +DH D. H. +DHB D. H. B. +DHL D. H. L. +DIA D. I. A. +DIW D. I. W. +DJ D. J. +DJIA D. J. I. A. +DJP D. J. P. +DJS D. J. S. +DKB D. K. B. +DKM D. K. M. +DL D. L. +DLC D. L. C. +DLJ D. L. J. +DM D. M. +DMA D. M. A. +DMB D. M. B. +DMC D. M. C. +DMD D. M. D. +DME D. M. E. +DMI D. M. I. +DMS D. M. S. +DMW D. M. W. +DMZ D. M. Z. +DN D. N. +DNA D. N. A. +DNC D. N. C. +DNX D. N. X. +DOC D. O. C. +DOD D. O. D. +DOE D. O. E. +DOS D. O. S. +DOT D. O. T. +DP D. P. +DPC D. P. C. +DPG D. P. G. +DPL D. P. L. +DPP D. P. P. +DPS D. P. S. +DPT D. P. T. +Dr Doctor +DRG D. R. G. +DRI D. R. I. +DS D. S. +DSA D. S. A. +DSC D. S. C. +DSL D. S. L. +DSLT D. S. L. T. +DSM D. S. M. +DSP D. S. P. +DST D. S. T. +DTC D. T. C. +DTH D. T. H. +DTI D. T. I. +DV D. V. +DVFA D. V. F. A. +DWG D. W. G. +DX D. X. +DYR D. Y. R. +EA E. A. +EAC E. A. C. +EAL E. A. L. +EAS E. A. S. +EB E. B. +EBDC E. B. D. C. +EBRD E. B. R. D. +EBS E. B. S. +EC E. C. +ECC E. C. C. +ECD E. C. D. +ECI E. C. I. +ECL E. C. L. +ECPA E. C. P. A. +ECU E. C. U. +EDA E. D. A. +EDB E. D. B. +EDC E. D. C. +EDI E. D. I. +EDM E. D. M. +EDP E. D. P. +EDS E. D. S. +EDT E. D. T. +EEC E. E. C. +EECO E. E. C. O. +EEI E. E. I. +EEOC E. E. O. C. +EEP E. E. P. +EES E. E. S. +EESP E. E. S. P. +EF E. F. +EFA E. F. A. +EFC E. F. C. +EG E. G. +EGA E. G. A. +EI E. I. +EIA E. I. A. +EIB E. I. B. +EIC E. I. C. +EIP E. I. P. +EITC E. I. T. C. +EIU E. I. U. +ELN E. L. N. +EMC E. M. C. +EMEA E. M. E. A. +EMI E. M. I. +EMS E. M. S. +EMT E. M. T. +ENI E. N. I. +ENSR E. N. S. R. +EP E. P. +EPA E. P. A. +EPLF E. P. L. F. +EPO E. M. O. +EPO E. P. O. +EPRI E. P. R. I. +ERC E. R. C. +ERG E. R. G. 
+ERIS E. R. I. S. +ERM E. R. M. +ERO E. R. O. +ERS E. R. S. +ES E. S. +ESA E. S. A. +ESB E. S. B. +ESI E. S. I. +ESL E. S. L. +ESOP E. S. O. P. +ESP E. S. P. +ESPN E. S. P. N. +ESS E. S. S. +EST E. S. T. +ET E. T. +ETA E. T. A. +ETBE E. T. B. E. +ETS E. T. S. +EU E. U. +EUA E. U. A. +EWE E. W. E. +EXL E. X. L. +EXP E. X. P. +EZ E. Z. +FA F. A. +FAA F. A. A. +FAC F. A. C. +FADA F. A. D. A. +FAI F. A. I. +FAO F. A. O. +FARC F. A. R. C. +FAS F. A. S. +FASB F. A. S. B. +FAZ F. A. Z. +FBI F. B. I. +FBS F. B. S. +FC F. C. +FCA F. C. A. +FCB F. C. B. +FCC F. C. C. +FCD F. C. D. +FCMI F. C. M. I. +FDA F. D. A. +FDC F. D. C. +FDIC F. D. I. C +FDIC F. D. I. C. +FDN F. D. N. +FDP F. D. P. +FDR F. D. R. +FEA F. E. A. +FEC F. E. C. +FEMA F. E. M. A. +FERC F. E. R. C. +FF F. F. +FFA F. F. A. +FFB F. F. B. +FFP F. F. P. +FGH F. G. H. +FGIC F. G. I. C. +FH F. H. +FHA F. H. A. +FHAA F. H. A. A. +FHFB F. H. F. B. +FHLB F. H. L. B. +FHLBB F. H. L. B. B. +FHP F. H. P. +FIA F. I. A. +FIAC F. I. A. C. +FICA F. I. C. A. +FICO F. I. C. O. +FIFA F. I. F. A. +FII F. I. I. +FIP F. I. P. +FK F. K. +FKB F. K. B. +FKI F. K. I. +FL F. L. +FLA F. L. A. +FLX F. L. X. +FM F. M. +FMC F. M. C. +FMHA F. M. H. A. +FmHA F. M. H. A. +FMI F. M. I. +FMLN F. M. L. N. +FMR F. M. R. +FMS F. M. S. +FN F. N. +FNN F. N. N. +FNS F. N. S. +FOMC F. O. M. C. +FP F. P. +FPA F. P. A. +FPC F. P. C. +FPCO F. P. C. O. +FPL F. P. L. +FR F. R. +FRA F. R. A. +FS F. S. +FSA F. S. A. +FSB F. S. B. +FSC F. S. C. +FSD F. S. D. +FSIA F. S. I. A. +FSLIC F. S. L. I. C. +FSLN F. S. L. N. +FSX F. S. X. +FT F. T. +FTC F. T. C. +FTS F. T. S. +FTSE F. T. S. E. +FX F. X. +FYI F. Y. I. +GA G. A. +GAAP G. A. A. P. +GAC G. A. C. +GAF G. A. F. +GAO G. A. O. +GASB G. A. S. B. +GATT G. A. T. T. +GATX G. A. T. X. +GB G. B. +GBL G. B. L. +GBM G. B. M. +GBS G. B. S. +GC G. C. +GCA G. C. A. +GCC G. C. C. +GCI G. C. I. +GDM G. D. M. +GDP G. D. P. +GDR G. D. R. +GE G. E. +GEC G. E. C. +GECC G. E. C. C. +GF G. F. +GFI G. F. I. +GFT G. F. T. +GGK G. G. K. +GHF G. H. F. +GHKM G. H. K. M. +GHR G. H. R. +GHS G. H. S. +GHRF G. H. R. F. +GI G. I. +GIA G. I. A. +GIC G. I. C. +GIS G. I. S. +GK G. K. +GKN G. K. N. +GL G. L. +GLCM G. L. C. M. +GLI G. L. I. +GM G. M. +GMA G. M. A. +GMAC G. M. A. C. +GMBH G. M. B. H. +GMC G. M. C. +GMF G. M. F. +GMHC G. M. H. C. +GMN G. M. N. +GMT G. M. T. +GMTV G. M. T. V. +GNB G. N. B. +GNI G. N. I. +GNMA G. N. M. A. +GNP G. N. P. +GOP G. O. P. +GP G. P. +GPA G. P. A. +GPD G. P. D. +GPG G. P. G. +GPO G. P. O. +GPS G. P. S. +GPT G. P. T. +GPU G. P. U. +GQ G. Q. +GR G. R. +GRE G. R. E. +GRI G. R. I. +GRU G. R. U. +GS G. S. +GSA G. S. A. +GSD G. S. D. +GSI G. S. I. +GSL G. S. L. +GSP G. S. P. +GSS G. S. S. +GST G. S. T. +GSX G. S. X. +GT G. T. +GTA G. T. A. +GTC G. T. C. +GTE G. T. E. +GTECH G. Tech +GTG G. T. G. +GTI G. T. I. +GTS G. T. S. +GV G. V. +GW G. W. +GWC G. W. C. +GXE G. X. E. +HBJ H. B. J. +HBM H. B. M. +HBO H. B. O. +HCA H. C. A. +HCC H. C. C. +HCI H. C. I. +HCFA H. C. F. A. +HCFC H. C. F. C. +HCS H. C. S. +HD H. D. +HDL H. D. L. +HDM H. D. M. +HDTV H. D. T. V. +HEI H. E. I. +HF H. F. +HFC H. F. C. +HG H. G. +HGTV H. G. T. V. +HH H. H. +HHB H. H. B. +HHS H. H. S. +HILB H. I. L. B. +HIV H. I. V. +HK H. K. +HKSAR H. K. S. A. R. +HL H. L. +HLM H. L. M. +HLX H. L. X. +HMA H. M. A. +HMDA H. M. D. A. +HMG H. M. G. +HMO H. M. O. +HMS H. M. S. +HMSS H. M. S. S. +HN H. N. +HNSX H. N. S. X. +HNV H. N. V. +HP H. P. +HPB H. P. B. +HQ H. Q. +HR H. R. +HRB H. R. B. +HRE H. R. E. +HRI H. R. I. +HRS H. R. S. +HSA H. S. A. +HSBC H. S. B. C. +HSH H. S. H. 
+HSST H. S. S. T. +HSV H. S. V. +HT H. T. +HTLV H. T. L. V. +HWC H. W. C. +HZN H. Z. N. +IADB I. A. D. B. +IAE I. A. E. +IAEA I. A. E. A. +IAEC I. A. E. C. +IAFP I. A. F. P. +IAM I. A. M. +IATA I. A. T. A. +IB I. B. +IBA I. B. A. +IBAA I. B. A. A. +IBC I. B. C. +IBCA I. B. C. A. +IBES I. B. E. S. +IBEW I. B. E. W. +IBH I. B. H. +IBI I. B. I. +IBJ I. B. J. +IBM I. B. M. +IBP I. B. P. +IC I. C. +ICA I. C. A. +ICAO I. C. A. O. +ICBM I. C. B. M. +ICC I. C. C. +ICCO I. C. C. O. +ICEE I. C. E. E. +ICF I. C. F. +ICG I. C. G. +ICH I. C. H. +ICI I. C. I. +ICL I. C. L. +ICM I. C. M. +ICN I. C. N. +ICO I. C. O. +ICRP I. C. R. P. +ICSL I. C. S. L. +ID I. D. +IDA I. D. A. +IDB I. D. B. +IDC I. D. C. +IDD I. D. D. +IDF I. D. F. +IDG I. D. G. +IDI I. D. I. +IDS I. D. S. +IEA I. E. A. +IEC I. E. C. +IEJW I. E. J. W. +IFA I. F. A. +IFAR I. F. A. R. +IFB I. F. B. +IFC I. F. C. +IFE I. F. E. +IFF I. F. F. +IFI I. F. I. +IFO I. F. O. +IFR I. F. R. +IFRB I. F. R. B. +IG I. G. +IGB I. G. B. +IgG I. g. G. +IGI I. G. I. +IGT I. G. T. +IGX I. G. X. +IH I. H. +IHI I. H. I. +IIGS I. I. G. S. +IIS I. I. S. +IIT I. I. T. +IJ I. J. +IKEA I. K. E. A. +IL I. L. +ILA I. L. A. +ILC I. L. C. +ILGWU I. L. G. W. U. +ILO I. L. O. +ILS I. L. S. +IM I. M. +IMA I. M. A. +IMC I. M. C. +IMD I. M. D. +IMF I. M. F. +IMG I. M. G. +IMI I. M. I. +IMM I. M. M. +IMO I. M. O. +IMS I. M. S. +IMT I. M. T. +IMU I. M. U. +INA I. N. A. +INB I. N. B. +Inc Incorporated +IND I. N. D. +INF I. N. F. +ING I. N. G. +INI I. N. I. +INPO I. N. P. O. +INR I. N. R. +INS I. N. S. +Intl International +Intercorp Intercorporation +IOC I. O. C. +IOR I. O. R. +IOS I. O. S. +IOU I. O. U. +IP I. P. +IPC I. P. C. +IPE I. P. E. +IPFA I. P. F. A. +IPM I. P. M. +IPO I. P. O. +IPS I. P. S. +IQ I. Q. +IRA I. R. A. +IRI I. R. I. +IRNA I. R. N. A. +IROC I. R. O. C. +IRS I. R. S. +IRT I. R. T. +ISC I. S. C. +ISDN I. S. D. N. +ISE I. S. E. +ISI I. S. I. +ISL I. S. L. +ISM I. S. M. +ISO I. S. O. +ISS I. S. S. +ITA I. T. A. +ITC I. T. C. +ITG I. T. G. +ITN I. T. N. +ITT I. T. T. +ITV I. T. V. +IU I. U. +IUD I. U. D. +IUE I. U. E. +IUR I. U. R. +IVF I. V. F. +IVI I. V. I. +IVIG I. V. I. G. +IXL I. X. L. +IWA I. W. A. +JAL J. A. L. +JAMA J. A. M. A. +JATP J. A. T. P. +JBA J. B. A. +JC J. C. +JCB J. C. B. +JCP J. C. P. +JCS J. C. S. +JCT J. C. T. +JDS J. D. S. +JEC J. E. C. +JFA J. F. A. +JFK J. F. K. +JGC J. G. C. +JHM J. H. M. +JIT J. I. T. +JLG J. L. G. +JMB J. M. B. +JMR J. M. R. +JOA J. O. A. +JP J. P. +JPL J. P. L. +JPM J. P. M. +JR J. R. +JRA J. R. A. +JSP J. S. P. +JT J. T. +JTL J. T. L. +JTM J. T. M. +JTPA J. T. P. A. +JVC J. V. C. +JVP J. V. P. +JWD J. W. D. +JWP J. W. P. +JWT J. W. T. +KAL K. A. L. +KB K. B. +KBA K. B. A. +KBGS K. B. G. S. +KBS K. B. S. +KC K. C. +KCBS K. C. B. S. +KCP K. C. P. +KCS K. C. S. +KCST K. C. S. T. +KD K. D. +KDD K. D. D. +KDI K. D. I. +KETV K. E. T. V. +KF K. F. +KFC K. F. C. +KFF K. F. F. +KFW K. F. W. +KG K. G. +KGaA K. G. a. A. +KGB K. G. B. +KGF K. G. F. +KGMC K. G. M. C. +KH K. H. +KHD K. H. D. +KHJ K. H. J. +KIC K. I. C. +KIO K. I. O. +KK K. K. +KKB K. K. B. +KKR K. K. R. +KLA K. L. A. +KLM K. L. M. +KLP K. L. P. +KLUC K. L. U. C. +KMA K. M. A. +KMET K. M. E. T. +KMG K. M. G. +KMS K. M. S. +KMT K. M. T. +KMW K. M. W. +KN K. N. +KNON K. N. O. N. +KOP K. O. P. +KPAX K. P. A. X. +KPC K. P. C. +KPFK K. P. F. K. +KPMG K. P. M. G. +KPRC K. P. R. C. +KSI K. S. I. +KSZ K. S. Z. +KTF K. T. F. +KTM K. T. M. +KTWV K. T. W. V. +KV K. V. +KVIL K. V. I. L. +KW K. W. +KWU K. W. U. +KZKC K. Z. K. C. +LA L. A. +LB L. B. +LBJ L. B. J. +LBO L. B. O. 
+LBS L. B. S. +LCA L. C. A. +LCD L. C. D. +LCG L. C. G. +LCI L. C. I. +LCP L. C. P. +LDC L. D. C. +LDDS L. D. D. S. +LDI L. D. I. +LDL L. D. L. +LDP L. D. P. +LDS L. D. S. +LDX L. D. X. +LFB L. F. B. +LFC L. F. C. +LG L. G. +LGP L. G. P. +LH L. H. +LHS L. H. S. +LHX L. H. X. +LIC L. I. C. +LiFeS L. i. F. e. S. +LIG L. I. G. +LIN L. I. N. +LIPA L. I. P. A. +LISC L. I. S. C. +LJN L. J. N. +LL L. L. +LLC L. L. C. +LME L. M. E. +LMT L. M. T. +LN L. N. +LNG L. N. G. +LNR L. N. R. +LNS L. N. S. +LOF L. O. F. +LOR L. O. R. +LOT L. O. T. +LP L. P. +LPC L. P. C. +LPGA L. P. G. A. +LPL L. P. L. +LPP L. P. P. +LS L. S. +LSB L. S. B. +LSC L. S. C. +LSD L. S. D. +LSI L. S. I. +LSU L. S. U. +LT L. T. +LTCB L. T. C. B. +LTD L. T. D. +LTV L. T. V. +LTX L. T. X. +LVI L. V. I. +LVMH L. V. M. H. +LX L. X. +LY L. Y. +MAI M. A. I. +MB M. B. +MBA M. B. A. +MBAA M. B. A. A. +MBB M. B. B. +MBE M. B. E. +MBF M. B. F. +MBFR M. B. F. R. +MBH M. B. H. +MBI M. B. I. +MBIA M. B. I. A. +MBS M. B. S. +MC M. C. +MCA M. C. A. +MCC M. C. C. +MCCP M. C. C. P. +MCEG M. C. E. G. +MCI M. C. I. +MCM M. C. M. +MCN M. C. N. +MCO M. C. O. +MCP M. C. P. +MCS M. C. S. +MD M. D. +MDA M. D. A. +MDB M. D. B. +MDC M. D. C. +MDI M. D. I. +MDM M. D. M. +MDT M. D. T. +MEBA M. E. B. A. +MEI M. E. I. +MEK M. E. K. +MEM M. E. M. +MEPC M. E. P. C. +MFA M. F. A. +MFI M. F. I. +MFL M. F. L. +MFN M. F. N. +MFS M. F. S. +MGC M. G. C. +MGI M. G. I. +MGM M. G. M. +MH M. H. +MHA M. H. A. +MHC M. H. C. +MHI M. H. I. +MHP M. H. P. +MHQ M. H. Q. +MI M. I. +MIA M. I. A. +MICC M. I. C. C. +MIGA M. I. G. A. +MIM M. I. M. +MIP M. I. P. +MIPS M. I. P. S. +MIS M. I. S. +MIT M. I. T. +MITI M. I. T. I. +MK M. K. +MKI M. K. I. +ML M. L. +MLP M. L. P. +MLPI M. L. P. I. +MLS M. L. S. +MLX M. L. X. +MMAC M. M. A. C. +MMC M. M. C. +MMI M. M. I. +MMPI M. M. P. I. +MMR M. M. R. +MMS M. M. S. +MMWEC M. M. W. E. C. +MNC M. N. C. +MNet M. Net +MNX M. N. X. +MP M. P. +MPAA M. P. A. A. +MPB M. P. B. +MPLA M. P. L. A. +MPS M. P. S. +MPT M. P. T. +MPTP M. P. T. P. +MPV M. P. V. +MRC M. R. C. +MRCA M. R. C. A. +MRI M. R. I. +MRP M. R. P. +MRTA M. R. T. A. +MS M. S. +MSA M. S. A. +MSHA M. S. H. A. +MSI M. S. I. +MSL M. S. L. +MSM M. S. M. +MSOE M. S. O. E. +MSP M. S. P. +MSRB M. S. R. B. +MSU M. S. U. +MSX M. S. X. +MTA M. T. A. +MTB M. T. B. +MTBE M. T. B. E. +MTech M. Tech +MTI M. T. I. +MTM M. T. M. +MTR M. T. R. +MTS M. T. S. +MTU M. T. U. +MTV M. T. V. +MV M. V. +MVP M. V. P. +MVS M. V. S. +MX M. X. +NA N. A. +NAACP N. Double A. C. P. +NAC N. A. C. +NACA N. A. C. A. +NACM N. A. C. M. +NAD N. A. D. +NAEIR N. A. E. I. R. +NAEP N. A. E. P. +NAHB N. A. H. B. +NAIC N. A. I. C. +NAL N. A. L. +NALU N. A. L. U. +NAM N. A. M. +NAPAP N. A. P. A. P. +NAPM N. A. P. M. +NAR N. A. R. +NARFE N. A. R. F. E. +NAS N. A. S. +#NASA N. A. S. A. +NASD N. A. S. D. +NASSA N. A. S. S. A. +NATCA N. A. T. C. A. +NAV N. A. V. +NBA N. B. A. +NBC N. B. C. +NBD N. B. D. +NBER N. B. E. R. +NBI N. B. I. +NBO N. B. O. +NBS N. B. S. +NC N. C. +NCA N. C. A. +NCAA N. C. A. A. +NCB N. C. B. +NCC N. C. C. +NCI N. C. I. +NCIF N. C. I. F. +NCMS N. C. M. S. +NCNB N. C. N. B. +NCR N. C. R. +NCTA N. C. T. A. +NDF N. D. F. +NDI N. D. I. +NDP N. D. P. +NEA N. E. A. +NEC N. E. C. +NEH N. E. H. +NEI N. E. I. +NESB N. E. S. B. +NETAAC N. E. T. A. A. C. +NFA N. F. A. +NFC N. F. C. +NFIB N. F. I. B. +NFIC N. F. I. C. +NFL N. F. L. +NFPA N. F. P. A. +NFS N. F. S. +NFSW N. F. S. W. +NGL N. G. L. +NH N. H. +NHK N. H. K. +NHL N. H. L. +NHS N. H. S. +NHTSA N. H. T. S. A. +NI N. I. +NIA N. I. A. +NIC N. I. C. +NIDA N. I. D. A. 
+NIH N. I. H. +NIMH N. I. M. H. +NIOSH N. I. O. S. H. +NIS N. I. S. +NJ N. J. +NKF N. K. F. +NKK N. K. K. +NKVD N. K. V. D. +NL N. L. +NLD N. L. D. +NLI N. L. I. +NLM N. L. M. +NLO N. L. O. +NLRB N. L. R. B. +NM N. M. +NME N. M. E. +NMP N. M. P. +NMS N. M. S. +NMTBA N. M. T. B. A. +NMU N. M. U. +NOAA N. O. A. A. +NOX N. O. X. +NPA N. P. A. +NPC N. P. C. +NPD N. P. D. +NPM N. P. M. +NRA N. R. A. +NRC N. R. C. +NRDC N. R. D. C. +NRECA N. R. E. C. A. +NRM N. R. M. +NS N. S. +NSA N. S. A. +NSC N. S. C. +NSF N. S. F. +NSM N. S. M. +NSPA N. S. P. A. +NT N. T. +NTC N. T. C. +NTG N. T. G. +NTIA N. T. I. A. +NTN N. T. N. +NTSB N. T. S. B. +NTT N. T. T. +NTX N. T. X. +NUI N. U. I. +NUM N. U. M. +NUS N. U. S. +NV N. V. +NVF N. V. F. +NW N. W. +NWA N. W. A. +NWQ N. W. Q. +NX N. X. +NY N. Y. +NYCB N. Y. C. B. +NYCE N. Y. C. E. +NYFE N. Y. F. E. +NYSE N. Y. S. E. +NYT N. Y. T. +NYU N. Y. U. +NZI N. Z. I. +OAG O. A. G. +OAS O. A. S. +OASDI O. A. S. D. I. +OAT O. A. T. +OCC O. C. C. +OCE O. C. E. +OCR O. C. R. +OCS O. C. S. +OCU O. C. U. +ODS O. D. S. +OEC O. E. C. +OECD O. E. C. D. +OED O. E. D. +OEL O. E. L. +OEM O. E. M. +OEX O. E. X. +OG O. G. +OIRA O. I. R. A. +OIS O. I. S. +OK O. K. +OKC O. K. C. +OMB O. M. B. +OMI O. M. I. +OMV O. M. V. +ONG O. N. G. +OPIC O. P. I. C. +OPM O. P. M. +ORI O. R. I. +ORS O. R. S. +OS O. S. +OSF O. S. F. +OSI O. S. I. +OSS O. S. S. +OTA O. T. A. +OTC O. T. C. +OTF O. T. F. +OTN O. T. N. +OTS O. T. S. +OTV O. T. V. +OV O. V. +PA P. A. +PAE P. A. E. +PAK P. A. K. +PATC P. A. T. C. +PB P. B. +PBA P. B. A. +PBGC P. B. G. C. +PBHG P. B. H. G. +PBI P. B. I. +PBR P. B. R. +PBS P. B. S. +PBX P. B. X. +PC P. C. +PCA P. C. A. +PCB P. C. B. +PCC P. C. C. +PCE P. C. E. +PCI P. C. I. +PCjr P. C. Junior +PCL P. C. L. +PCM P. C. M. +PCMCIA P. C. M. C. I. A. +PCN P. C. N. +PCP P. C. P. +PCR P. C. R. +PCS P. C. S. +PCW P. C. W. +PD P. D. +PDA P. D. A. +PDF P. D. F. +PDI P. D. I. +PDLA P. D. L. A. +PDR P. D. R. +PDT P. D. T. +PE P. E. +PECC P. E. C. C. +PF P. F. +PFM P. F. M. +PG P. G. +PGA P. G. A. +PGH P. G. H. +PhD P. H. D. +Ph.D P. H. D. +Ph.D.s P. H. D.s +Ph.Ds P. H. D.s +PhDs P. H. D.s +PHH P. H. H. +PHLCorp P. H. L. Corporation +PHM P. H. M. +PHP P. H. P. +PHPO P. H. P. O. +PI P. I. +PIK P. I. K. +PIP P. I. P. +PIR P. I. R. +PIW P. I. W. +PL P. L. +PLC P. L. C. +PLE P. L. E. +PLM P. L. M. +PLO P. L. O. +PM P. M. +PMA P. M. A. +PMC P. M. C. +PMDB P. M. D. B. +PMI P. M. I. +PMS P. M. S. +PMT P. M. T. +PNB P. N. B. +PNC P. N. C. +PNG P. N. G. +PNM P. N. M. +PNOC P. N. O. C. +POW P. O. W. +PP P. P. +PPD P. P. D. +PPG P. P. G. +PPI P. P. I. +PPM P. P. M. +PPO P. P. O. +PPP P. P. P. +PQQ P. Q. Q. +PR P. R. +PRB P. R. B. +PRC P. R. C. +PRD P. R. D. +PRI P. R. I. +PRSA P. R. S. A. +Pvt Private +PRK P. R. K. +PRP P. R. P. +PS P. S. +PSA P. S. A. +PSC P. S. C. +PSE P. S. E. +PSG P. S. G. +PSI P. S. I. +PSNH P. S. N. H. +PSR P. S. R. +PST P. S. T. +PSUM P. S. U. M. +PT P. T. +PTA P. T. A. +PTI P. T. I. +PTL P. T. L. +PTT P. T. T. +PUC P. U. C. +PV P. V. +PVC P. V. C. +PW P. W. +PWA P. W. A. +PWS P. W. S. +PX P. X. +PYA P. Y. A. +QB Q. B. +QDE Q. D. E. +QE Q. E. +QFB Q. F. B. +QMS Q. M. S. +QO Q. O. +QVC Q. V. C. +RAC R. A. C. +RAF R. A. F. +RAI R. A. I. +RB R. B. +RBC R. B. C. +RC R. C. +RCA R. C. A. +RCI R. C. I. +RCM R. C. M. +RD R. D. +RDF R. D. F. +RDP R. D. P. +REIT R. E. I. T. +RF R. F. +RFC R. F. C. +RFD R. F. D. +RFE R. F. E. +RFI R. F. I. +RFTV R. F. T. V. +RG R. G. +RHI R. H. I. +RHM R. H. M. +RI R. I. +RJ R. J. +RJR R. J. R. +RKO R. K. O. +RL R. L. +RLC R. L. C. +RLI R. L. I. 
+RLR R. L. R. +RMC R. M. C. +RMI R. M. I. +RMJ R. M. J. +RMS R. M. S. +RMV R. M. V. +RNA R. N. A. +RNC R. N. C. +RO R. O. +ROA R. O. A. +ROC R. O. C. +ROTC R. O. T. C. +RPA R. P. A. +RPM R. P. M. +RREEF R. R. E. E. F. +RS R. S. +RSC R. S. C. +RSCG R. S. C. G. +RSI R. S. I. +RSO R. S. O. +RSV R. S. V. +RT R. T. +RTBF R. T. B. F. +RTC R. T. C. +RTE R. T. E. +RTHK R. T. H. K. +RTL R. T. L. +RTM R. T. M. +RTS R. T. S. +RTZ R. T. Z. +RU R. U. +RUC R. U. C. +RV R. V. +RWE R. W. E. +RX R. X. +SA S. A. +SAA S. A. A. +SAB S. A. B. +SACC S. A. C. C. +SACP S. A. C. P. +SAI S. A. I. +SAL S. A. L. +SALP S. A. L. P. +SAO S. A. O. +SAPC S. A. P. C. +SAS S. A. S. +SAT S. A. T. +SB S. B. +SBA S. B. A. +SBC S. B. C. +SBCI S. B. C. I. +SBIC S. B. I. C. +SBIR S. B. I. R. +SBK S. B. K. +SBS S. B. S. +SC S. C. +SCA S. C. A. +SCE S. C. E. +SCEcorp S. C. E. Corporation +SCI S. C. I. +SCM S. C. M. +SD S. D. +SDA S. D. A. +SDC S. D. C. +SDG S. D. G. +SDI S. D. I. +SDP S. D. P. +SDR S. D. R. +SDRC S. D. R. C. +SDS S. D. S. +SE S. E. +SEC S. E. C. +SEEQ S. E. E. Q. +SEI S. E. I. +SEL S. E. L. +SEM S. E. M. +SES S. E. S. +SF S. F. +SFC S. F. C. +SFE S. F. E. +SFN S. F. N. +SFO S. F. O. +SGB S. G. B. +SGC S. G. C. +SGI S. G. I. +SGS S. G. S. +SH S. H. +SHL S. H. L. +SHV S. H. V. +SI S. I. +SIA S. I. A. +SIB S. I. B. +SIBV S. I. B. V. +SIPC S. I. P. C. +SIV S. I. V. +SJNB S. J. N. B. +SK S. K. +SKF S. K. F. +SKK S. K. K. +SL S. L. +SLA S. L. A. +SLH S. L. H. +SLM S. L. M. +SLR S. L. R. +SMC S. M. C. +SME S. M. E. +SMES S. M. E. S. +SMR S. M. R. +SMS S. M. S. +SMU S. M. U. +SMUD S. M. U. D. +SNC S. N. C. +SNCF S. N. C. F. +SNET S. N. E. T. +SNIA S. N. I. A. +SNL S. N. L. +SNPE S. N. P. E. +SOES S. O. E. S. +SOS S. O. S. +SP S. P. +SPD S. P. D. +SPE S. P. E. +SPEP S. Pep +SPG S. P. G. +SPI S. P. I. +SPS S. P. S. +SPSF S. P. S. F. +SPX S. P. X. +SpA Company +S.p.A Company +SQL S. Q. L. +SR S. R. +SRI S. R. I. +SRK S. R. K. +SRL S. R. L. +SRO S. R. O. +SRS S. R. S. +SS S. S. +SSA S. S. A. +SSB S. S. B. +SSBI S. S. B. I. +SSC S. S. C. +SSI S. S. I. +SSMC S. S. M. C. +SSN S. S. N. +SSP S. S. P. +SST S. S. T. +STC S. T. C. +Ste Saint +STS S. T. S. +SVP S. V. P. +SX S. X. +TA T. A. +TB T. B. +TBA T. B. A. +TBC T. B. C. +TBF T. B. F. +TBG T. B. G. +TBK T. B. K. +TBN T. B. N. +TBS T. B. S. +TBWA T. B. W. A. +TC T. C. +TCA T. C. A. +TCBY T. C. B. Y. +TCC T. C. C. +TCF T. C. F. +TCI T. C. I. +TCMP T. C. M. P. +TCP T. C. P. +TCS T. C. S. +TCU T. C. U. +TCW T. C. W. +TD T. D. +TDD T. D. D. +TDK T. D. K. +TDU T. D. U. +TE T. E. +TEC T. E. C. +TEP T. E. P. +TF T. F. +TFBA T. F. B. A. +TFD T. F. D. +TFF T. F. F. +TFR T. F. R. +TGI T. G. I. +TGL T. G. L. +TGWU T. G. W. U. +THA T. H. A. +THI T. H. I. +THT T. H. T. +TI T. I. +TIAA T. I. A. A. +TII T. I. I. +TIL T. I. L. +TIMI T. I. M. I. +TJ T. J. +TJX T. J. X. +TKR T. K. R. +TLC T. L. C. +TM T. M. +TMC T. M. C. +TMI T. M. I. +TMIC T. M. I. C. +TMK T. M. K. +TMOC T. M. O. C. +TNA T. N. A. +TNF T. N. F. +TNM T. N. M. +TNP T. N. P. +TNT T. N. T. +TOA T. O. A. +TPA T. P. A. +tPA t. P. A. +TPF T. P. F. +TPI T. P. I. +TPS T. P. S. +TR T. R. +TRC T. R. C. +TRE T. R. E. +TRO T. R. O. +TRS T. R. S. +TRT T. R. T. +TRW T. R. W. +TS T. S. +TSA T. S. A. +TSB T. S. B. +TSE T. S. E. +TSF T. S. F. +TSI T. S. I. +TSO T. S. O. +TSSU T. S. S. U. +TTAC T. T. A. C. +TTAPS T. T. A. P. S. +TU T. U. +TV T. V. +TVA T. V. A. +TVI T. V. I. +TVS T. V. S. +TVSM T. V. S. M. +TVX T. V. X. +TW T. W. +TWA T. W. A. +TX T. X. +TXI T. X. I. +TXL T. X. L. +TXO T. X. O. +UA U. A. +UAE U. A. E. +UAL U. A. L. +UAP U. A. P. 
+UAW U. A. W. +UBAF U. B. A. F. +UBS U. B. S. +UC U. C. +UCLA U. C. L. A. +UCLAF U. C. L. A. F. +UCSD U. C. S. D. +UCSF U. C. S. F. +UD U. D. +UDAG U. D. A. G. +UDC U. D. C. +UDF U. D. F. +UEI U. E. I. +UFO U. F. O. +UFT U. F. T. +UFW U. F. W. +UGI U. G. I. +UH U. H. +UHF U. H. F. +UHL U. H. L. +UI U. I. +UIC U. I. C. +UIS U. I. S. +UJA U. J. A. +UK U. K. +UKI U. K. I. +ULI U. L. I. +UMBC U. M. B. C. +UMC U. M. C. +UMNO U. M. N. O. +UMTA U. M. T. A. +UMW U. M. W. +UNAM U. N. A. M. +UNC U. N. C. +UNCF U. N. C. F. +UNDP U. N. D. P. +UNHCR U. N. H. C. R. +UNLV U. N. L. V. +UNR U. N. R. +UOP U. O. P. +UPC U. P. C. +UPI U. P. I. +UPS U. P. S. +URS U. R. S. +URW U. R. W. +US U. S. +USA U. S. A. +U.S.A U. S. A. +USAA U. S. A. A. +USACafes U. S. A. Cafes +USADirect U. S. A. Direct +USAir U. S. Air +USC U. S. C. +USCB U. S. C. B. +USDA U. S. D. A. +USF U. S. F. +USFL U. S. F. L. +USG U. S. G. +USH U. S. H. +USI U. S. I. +USIA U. S. I. A. +USLico U. S. Lico +USLife U. S. Life +USO U. S. O. +USOC U. S. O. C. +Uspci U. S. P. C. I. +USPS U. S. P. S. +USS U. S. S. +USSC U. S. S. C. +USSR U. S. S. R. +UST U. S. T. +USW U. S. W. +USX U. S. X. +UT U. T. +UTA U. T. A. +UTC U. T. C. +UTL U. T. L. +UTU U. T. U. +UV U. V. +UX U. X. +VA V. A. +VAAP V. A. A. P. +VAD V. A. D. +VAN V. A. N. +VBI V. B. I. +VC V. C. +VCI V. C. I. +VCR V. C. R. +VCRS V. C. R. S. +VCS V. C. S. +VD V. D. +VDT V. D. T. +VF V. F. +VFW V. F. W. +VG V. G. +VGA V. G. A. +VH V. H. +VHA V. H. A. +VHF V. H. F. +VHL V. H. L. +VHS V. H. S. +VIP V. I. P. +VIR V. I. R. +VISN V. I. S. N. +VJN V. J. N. +VLI V. L. I. +VLSI V. L. S. I. +VM V. M. +VMS V. M. S. +VMX V. M. X. +VNA V. N. A. +VNR V. N. R. +VNU V. N. U. +VO V. O. +VOA V. O. A. +VOR V. O. R. +VP V. P. +VPI V. P. I. +VPT V. P. T. +VQT V. Q. T. +VR V. R. +VRA V. R. A. +VS V. S. +VSAT V. S. A. T. +VSB V. S. B. +VTC V. T. C. +VTR V. T. R. +VTX V. T. X. +VW V. W. +VWR V. W. R. +WABC W. A. B. C. +WAFA W. A. F. A. +WASP W. A. S. P. +WATS W. A. T. S. +WB W. B. +WBA W. B. A. +WBAI W. B. A. I. +WBBM W. B. B. M. +WBZ W. B. Z. +WCBS W. C. B. S. +WCI W. C. I. +WCIX W. C. I. X. +WCK W. C. K. +WCRS W. C. R. S. +WCVB W. C. V. B. +WD W. D. +WDB W. D. B. +WEFA W. E. F. A. +WEG W. E. G. +WestLB West L. B. +WEU W. E. U. +WFAN W. F. A. N. +WFBQ W. F. B. Q. +WFC W. F. C. +WFIA W. F. I. A. +WFLA W. F. L. A. +WFRR W. F. R. R. +WFXT W. F. X. T. +WGA W. G. A. +WGBH W. G. B. H. +WGC W. G. C. +WGHP W. G. H. P. +WGMS W. G. M. S. +WGN W. G. N. +WHAS W. H. A. S. +WHBQ W. H. B. Q. +WIC W. I. C. +WITI W. I. T. I. +WJBK W. J. B. K. +WJW W. J. W. +WKYS W. K. Y. S. +WLR W. L. R. +WM W. M. +WMAQ W. M. A. Q. +WMG W. M. G. +WMMS W. M. M. S. +WMS W. M. S. +WNAC W. N. A. C. +WNBC W. N. B. C. +WNCN W. N. C. N. +WNET W. N. E. T. +WNEW W. N. E. W. +WNS W. N. S. +WNW W. N. W. +WNYC W. N. Y. C. +WNYW W. N. Y. W. +WOJB W. O. J. B. +WOMC W. O. M. C. +WOR W. O. R. +WPA W. P. A. +WPBF W. P. B. F. +WPGH W. P. G. H. +WPIX W. P. I. X. +WPP W. P. P. +WPPSS W. P. P. S. S. +WQHT W. Q. H. T. +WQTV W. Q. T. V. +WQUE W. Q. U. E. +WR W. R. +WRKO W. R. K. O. +WROR W. R. O. R. +WSBK W. S. B. K. +WSCV W. S. C. V. +WSGP W. S. G. P. +WSJ W. S. J. +WSVN W. S. V. N. +WTBS W. T. B. S. +WTC W. T. C. +WTD W. T. D. +WTI W. T. I. +WTLV W. T. L. V. +WTO W. T. O. +WTTV W. T. T. V. +WTVJ W. T. V. J. +WTVT W. T. V. T. +WTXF W. T. X. F. +WW W. W. +WWII W. W. I. I. +WWL W. W. L. +WWOR W. W. O. R. +WXRK W. X. R. K. +WYLD W. Y. L. D. +WYNY W. Y. N. Y. +WZTV W. Z. T. V. +XA X. A. +XE X. E. +XJ X. J. +XL X. L. +XMP X. M. P. +XP X. P. +XR X. R. +XT X. T. +XTC X. T. C. 
+XYZ X. Y. Z. +YMCA Y. M. C. A. +YSL Y. S. L. +YTT Y. T. T. +YWCA Y. W. C. A. +ZCB Z. C. B. +ZDF Z. D. F. +ZMI Z. M. I. +ZR Z. R. +ZTS Z. T. S. +ZX Z. X. +mm m. m. +mg m. g. diff --git a/egs/bn/s5/local/data_prep/csr_hub4_utils/abbrproc.perl b/egs/bn/s5/local/data_prep/csr_hub4_utils/abbrproc.perl new file mode 100755 index 00000000000..f22684c5742 --- /dev/null +++ b/egs/bn/s5/local/data_prep/csr_hub4_utils/abbrproc.perl @@ -0,0 +1,465 @@ +#!/usr/bin/perl +# $Id: abbrproc.perl,v 1.3 1996/08/21 20:05:09 robertm Rel $ +############################################################################### +# This software is being provided to you, the LICENSEE, by the Massachusetts # +# Institute of Technology (M.I.T.) under the following license. By # +# obtaining, using and/or copying this software, you agree that you have # +# read, understood, and will comply with these terms and conditions: # +# # +# Permission to use, copy, modify and distribute, including the right to # +# grant others the right to distribute at any tier, this software and its # +# documentation for any purpose and without fee or royalty is hereby granted, # +# provided that you agree to comply with the following copyright notice and # +# statements, including the disclaimer, and that the same appear on ALL # +# copies of the software and documentation, including modifications that you # +# make for internal use or for distribution: # +# # +# Copyright 1991-4 by the Massachusetts Institute of Technology. All rights # +# reserved. # +# # +# THIS SOFTWARE IS PROVIDED "AS IS", AND M.I.T. MAKES NO REPRESENTATIONS OR # +# WARRANTIES, EXPRESS OR IMPLIED. By way of example, but not limitation, # +# M.I.T. MAKES NO REPRESENTATIONS OR WARRANTIES OF MERCHANTABILITY OR FITNESS # +# FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR # +# DOCUMENTATION WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, # +# TRADEMARKS OR OTHER RIGHTS. # +# # +# The name of the Massachusetts Institute of Technology or M.I.T. may NOT be # +# used in advertising or publicity pertaining to distribution of the # +# software. Title to copyright in this software and any associated # +# documentation shall at all times remain with M.I.T., and USER agrees to # +# preserve same. # +############################################################################### + +# abbreviation preprocessor for WSJ +# assumes 1 sentence per line +# +# 1. map "x.y." -> "x. y." +# 2. convert Roman numerals with appropriate left context into cardinal no.s +# 3. expand abbreviations and word translations +# expands remaining Roman numerals into ordinal no.s +# 4. map isolated letters: "x" -> "x." + +# Minor modifications by David Graff, Linguistic Data Consortium, in +# preparation for publishing on cdrom; Aug. 11, 1994. + +# Major modifications by Robert MacIntyre, LDC, attempting to improve +# performance (~50% speedup), in preparation of Broadcast News material, +# August 1996. 
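+# Summary of the tables loaded below: %romanlc holds the "*r" left-context +# words that license Roman-numeral expansion; %abbrev holds true +# abbreviations keyed without the final "." (e.g. "Adm" -> "Admiral"); +# %trans holds acronym translations (e.g. "ABC" -> "A. B. C."). Keys +# containing lower case are also entered in an all-upper-case variant.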
+ + +$file="$ENV{HOME}/bc-news/bin/abbrlist"; # default abbreviation file + +for($i=0,$j=0;$i<=$#ARGV;$i++) +{ if($ARGV[$i] =~ /^-/) + { if($ARGV[$i] =~ /^-v/) {$vflg=1;} + else {&perr("illegal flag: $ARGV[$i]");} + } + else + { # if($file) {&perr("multiple file arg");} + $file=$ARGV[i]; + } +} +@ARGV=(); +if(!file) {&perr("no abbreviation file specified"); } + +if(!open(FILE,$file)) {&perr("cannot open abbreviation file"); } +while() +{ if(/^#/) {next;} # comment + s/\n//; + if(!$_) {next;} # blank + $y=$_; + s/^(\S+)\s+//; # extract 1st word + $x=$1; + if(!$x) {&perr("no word: $y");} + if(!$_) {&perr("no value: $y");} + + if($x =~ /^\*r/) # left context for roman numeral + { if(!/^[a-zA-Z]{2,}$/) + {&perr("illegal roman: $x");} + tr/a-z/A-Z/; # map to UC + $romanlc{$_}=1; + } + elsif($x =~ /\.$/) # abbreviations + { if($x !~ /^[a-zA-Z][a-zA-Z\.]+\.$/) + {&perr("illegal abbreviation: $x");} + $x =~ s/\.$//; + $abbrev{$x}=$_; + if($x =~ /[a-z]/) + { $x =~ tr/a-z/A-Z/; #UC version + tr/a-z/A-Z/; + $abbrev{$x}=$_; + } + #if(length($x)>$maxabl) {$maxabl=length($x);} + } + else # translations + { if($x !~ /^[a-zA-Z\.&\/-]+[a-zA-Z]$/) + {&perr("illegal translation: $x");} + $trans{$x}=$_; + if($x =~ /[a-z]/) + { $x =~ tr/a-z/A-Z/; #UC version + tr/a-z/A-Z/; + $trans{$x}=$_; + } + #if(length($x)>$maxtrl) {$maxtrl=length($x);} + } + $n++; +} +#if($vflg) {print STDERR "$n lines read from file\n";} + +&setupRoman; + +while(<>) +{ ########################### abbrevproc #################################### + + # pass SGML as is + if (/^<\/?[spa]/) + { + print; + next; + } + chop; + + + s/&/ & /g; # & + s=/= / =g; # / + s/ - / -- /g; # save (long) dashes + s/\b(-+)\b/ $1 /g; # -, --, etc. in words + s/([^-\s])(-+)([^-\s])/$1 $2 $3/g; + + if(/_/) + { + &perr2("removing illegal underscores (_) in:\n $_\n"); + s/_//g; + } + + @input = split(/\s+/); + @output=(); + for($field=0;$field<=$#input;$field++) + { + $_ = $input[$field]; + # if($vflg) {print "in: $_\n";} + + s/^(\W*)//; # strip front + $front=$1; + + s/(\W*)$//; # strip back + $back=$1; + if(/\.?\'[sS]$/) # possessive + { + s/(\.?\'[sS])$//; + $back="$1$back"; + } + elsif (/^[A-Z]+s$/) # eg Cs or Xs + { + s/s$//; + $back="_s$back"; + } + + $ptbkflg = ($back =~ /^\./); + + #if($vflg) {print "f=$front, m=$_, b=$back\n";} + + + # Roman numerals + if(/^[IVX]{1,6}$/ && $front eq "" && $field>0 && + ($x=&geto())) + { + $x =~ tr/a-z/A-Z/; # map lc to UC + $x =~ s/^\W//; # strip initial punct from lc + if($romanlc{$x}) # left context check + { + if($front) + { + &pusho($front); + if($front !~ /[\w]$/) {$appendflg=1;} + } + + if ($x=$Roman{$_}) + { + &pusho($x); + } + else + { + &perr2("illegal roman: $_"); + &pusho($_); + } + + if($back) + { + if($back !~ /^[\w]/) {&appendo($back);} + else {&pusho($back);} + } + next; + } + + } + + + # St. or St ["Street" vs. 
"Saint"] + if($_ eq "St") + { $back =~ s/^\.//; + if($front ne "" && $back ne "") + { &perr2("Cannot resove St.: $input[$field-1] $input[$field] $input[$field+1]"); + $x=Street; # Wild guess + } + elsif($front) { $x="Saint"; } + elsif($back) { $x="Street"; } + elsif($input[$field-1] !~ /^[A-Z]/ + && $input[$field+1] =~ /^[A-Z]/) + { $x = "Saint"; } + elsif($input[$field-1] =~ /^[A-Z]/ + && $input[$field+1] !~ /^[A-Z]/) + { $x = "Street"; } + + elsif(!$back && $input[$field+1] =~ /^[A-Z]/) + { $x = "Saint"; } + elsif(!$back && $input[$field+1] eq '-' && + $input[$field+2] =~ /^[A-Z]/) + { $x = "Saint"; } + else + { &perr2("Cannot resove St.: $input[$field-1] $input[$field] $input[$field+1]"); + $x=Street; # Wild guess + } + + + if($front) + { &pusho($front); + if($front !~ /[\w]$/) {$appendflg=1;} + } + + &pusho($x); + + if($back) + { if($back !~ /^[\w]/) {&appendo($back);} + else {&pusho($back);} + } + next; + } + + # abbreviations (end with .) + if($ptbkflg && ($x=$abbrev{$_})) + { + if($front) + { &pusho($front); + if($front !~ /[\w]$/) + {$appendflg=1;} + } + + &pusho($x); + + if($field<$#input || $back =~ /[!?]/) + { $back =~ s/^\.//; } # rm . + else # end of sent + { $back =~ s/^\.(\'s)/$1./; + if($back =~ /\..*\./) # 2 dots + {$back=~s/\.([^\.]*)/$1/;} + } + + if($back) + { if($back !~ /^[\w]/) + {&appendo($back);} + else {&pusho($back);} + } + next; + + } + + # translations (do not end with .) + # first merge multi-token translations + if($input[$field+1] =~ /^[-\/&]$/ && $back eq "") + { $x=$input[$field+2]; + $x =~ s/(\W*)$//; + $xback=$1; + if($x =~ /\.?\'[sS]$/) # possessive + { $x =~ s/(\.?\'[sS])$//; + $xback="$1$xback"; + } + elsif ($x =~ /^[A-Z]+s$/) # eg Cs or Xs + { $x =~ s/s$//; + $xback="_s$xback"; + } + if($trans{"$_$input[$field+1]$x"}) # eg. AT&T + { $_="$_$input[$field+1]$x"; + $field+=2; + + $back=$xback; + $ptbkflg = ($back =~ /^\./); + } + } + # then see if we have a translation + if ($x=$trans{$_}) + { if($front) + { &pusho($front); + if($front !~ /[\w]$/) {$appendflg=1;} + } + + &pusho($x); + + if($x =~ /\.$/) { $back =~ s/^\.//; } # only 1 . + if($back) + { if($back !~ /^[\w]/) {&appendo($back);} + else {&pusho($back);} + } + next; + } + + # eg. Cs, but not As Is Ms Us + if(($back =~ /^_s/) && /^[B-HJ-LN-TV-Z]$/) + { if($front) + { &pusho($front); + if($front !~ /[\w]$/) {$appendflg=1;} + } + + &pusho("$_."); + + if($back) + { if($back !~ /^[\w]/) {&appendo($back);} + else {&pusho($back);} + } + next; + } + + # split x.y. + $_ .= '.' if $ptbkflg; # NOTE THIS CHANGES $_ FOR FUTURE MATCHES + # but it has no more uses in this loop, + # so this _should_ be okay. + if (/^([a-zA-Z]\.)+([sS]?)$/) + { + $sflag = $2; # remember if plural (as opposed to a.s.) + + chop if $ptbkflg; # trim period that we just added + + s/\./. /g; # x.y. -> x. y. + + s/ ([sS])$/$1/ if $sflag; # reattach final "s" + + if($front) + { &pusho($front); + if($front !~ /[\w]$/) {$appendflg=1;} + } + + &pusho($_); + + if($back) + { if($back !~ /^[\w]/) {&appendo($back);} + else {&pusho($back);} + } + next; + } + + # remaining tokens are passed "as is" + # [Below does "&pusho($input[$field]);" but faster, since we avoid + # the subroutine call for the most common case.] 
+ push(@output,$input[$field]); + } + + $_=join(" ",@output); + + # if($vflg) {print "ab:\t$_\n";} + + ######################### lettproc ###################################### + if (/\b[b-zB-HJ-Z]\b/) + { + @output = split(/\s+/); + + foreach(@output) + { + next unless /^\W*[b-zB-HJ-Z]\W*$/; + + #if($vflg) {print "le: $_\n";} + + # some cases to skip/pre-change. (Note that backslashing of + # quotes is for the sake of Emacs, not Perl.) + next if (/^[\'][nN]$/); # Spic \'n Span + + s/(^[\`\'][nN])[\`\']$/$1/ && next; # Rock 'n' Roll: 'n' -> \'n + + s/^[\`\'\"]R[\'\`\"]$/"R"/ && next; # Toys "R" Us + + next if (/^o\'$/); # Man o\' War + + # put . at end of remaining single-letter words + s/^(\W*)([b-zB-HJ-Z])([^.\w]\W*|[^\w.]*)$/$1$2.$3/; + } + + $_=join(" ",@output); + } + + s/\s+/ /g; + s/^ //; + s/ $//; + + s/ _//g; # attach final s for Cs or AFLs + s/_//g; # clear _ + s/ - /-/g; + + print $_,"\n" if $_; +} + +sub pusho # pusho($x): push output +{ if($appendflg) # global: used for fronts + { + &appendo(@_[0]); + } + else {push(@output,@_);} +} + +sub appendo # appendo($x): append to output +{ $appendflg=0; + if($#output < 0) {&perr("appendo: output empty");} + $output[$#output] .= @_[0]; +} + +sub geto # geto(): get last output +{ if($#output < 0) {print STDERR ("geto: output empty\n");} + return $output[$#output]; +} + +sub perr +{ print STDERR "abbrevproc: $_[0]\n"; + exit(1); +} + +sub perr2 +{ print STDERR "abbrevproc: $_[0]\n"; +} + +sub setupRoman +{ + $Roman{I}="one"; + $Roman{II}="two"; + $Roman{III}="three"; + $Roman{IV}="four"; + $Roman{V}="five"; + $Roman{VI}="six"; + $Roman{VII}="seven"; + $Roman{VIII}="eight"; + $Roman{IX}="nine"; + $Roman{X}="ten"; + $Roman{XI}="eleven"; + $Roman{XII}="twelve"; + $Roman{XIII}="thirteen"; + $Roman{XIV}="fourteen"; + $Roman{XV}="fifteen"; + $Roman{XVI}="sixteen"; + $Roman{XVII}="seventeen"; + $Roman{XVIII}="eighteen"; + $Roman{XIX}="nineteen"; + $Roman{XX}="twenty"; + $Roman{XXI}="twenty-one"; + $Roman{XXII}="twenty-two"; + $Roman{XXIII}="twenty-three"; + $Roman{XXIV}="twenty-four"; + $Roman{XXV}="twenty-five"; + $Roman{XXVI}="twenty-six"; + $Roman{XXVII}="twenty-seven"; + $Roman{XXVIII}="twenty-eight"; + $Roman{XXIX}="twenty-nine"; + $Roman{XXX}="thirty"; + $Roman{XXXI}="thirty-one"; + $Roman{XXXII}="thirty-two"; + $Roman{XXXIII}="thirty-three"; + $Roman{XXXIV}="thirty-four"; + $Roman{XXXV}="thirty-five"; +} diff --git a/egs/bn/s5/local/data_prep/csr_hub4_utils/addressforms b/egs/bn/s5/local/data_prep/csr_hub4_utils/addressforms new file mode 100644 index 00000000000..f3dcdddea7b --- /dev/null +++ b/egs/bn/s5/local/data_prep/csr_hub4_utils/addressforms @@ -0,0 +1,38 @@ +# abbreviation list used for WSJ0 (pilot) processing +# generated by Doug Paul, MIT/LL +# derived from unigram file 29 Aug 91 mods to 17 Sept 91 + +# true abbreviations (must end with .) +# if key includes lower case, an upper case version will be created +Adm. Admiral +Brig. Brigadeer +Capt. Captain +Cmdr. Commander +Col. Colonel +Cpl. Corporal +Dr. Doctor +Drs. Doctors +Fr. Friar +Ft. Fort +Gen. General +Gov. Governor +Lt. Lieutenant +Maj. Major +Mr. Mister +Mrs. Mistress +Ms. Miz +Messrs. Misters +Prof. Professor +Prop. Proposition +Pte. Point +Pvt. Private +Rep. Representative +Reps. Representatives +Rev. Reverend +Sen. Senator +Sens. Senators +Sgt. Sargent +St. Saint +Ste. Saint +vs. versus +v. 
versus
diff --git a/egs/bn/s5/local/data_prep/csr_hub4_utils/artfilter.perl b/egs/bn/s5/local/data_prep/csr_hub4_utils/artfilter.perl
new file mode 100755
index 00000000000..ed464e4a31d
--- /dev/null
+++ b/egs/bn/s5/local/data_prep/csr_hub4_utils/artfilter.perl
@@ -0,0 +1,83 @@
+#!/usr/bin/perl
+
+# artfilter.perl
+
+# This perl script can be used to (de)select articles from TIPSTER
+# format newswire data on the basis of the content of a specific
+# tagged element. This version allows a number of string patterns
+# (drawn from a separate input file) to be checked against the content
+# of a chosen tag, and allows residue articles to be sent to a
+# separate file (in addition to having selected articles written to
+# stdout).
+
+require "newgetopt.pl";
+$cmd_okay = &NGetOpt( 't=s', 'p=s', 'f=s', 'r=s', 'v', 'x' );
+$arg_okay = ( $opt_t ne "" && ( $opt_p ne "" || $opt_f ne "" ));
+
+if ( ! $cmd_okay || ! $arg_okay ) {
+ print
+"\nUsage: artfilter.perl -t tag [-p ptrn | -f ptrns] [-r resid] [-vx] [infile]\n";
+ print " writes DOCs with <tag> containing /ptrn(s)/ to stdout\n";
+ print " -v = select DOCs NOT containing /ptrn(s)/ in <tag>\n";
+ print " -x = exclude DOCs that do not contain <tag>\n";
+ print " -r = write residue DOCs to resid file\n";
+ exit;
+}
+
+@patrns = ();
+if ( $opt_f ne "" ) {
+ open( PATRNS, "<$opt_f" );
+ while (<PATRNS>) {
+ chop;
+ push( @patrns, $_ );
+ }
+} else {
+ push( @patrns, $opt_p );
+}
+close PATRNS;
+
+if ( $opt_r ) {
+ open( RESID, ">$opt_r" );
+}
+
+$outputOn = $foundtag = 0;
+
+while (<>)
+{
+ if ( /<DOC[ >]/ ) {
+ $artbuf = $_;
+ $outputOn = 1;
+ }
+ elsif ( /<\/DOC>/ ) {
+ if ( $outputOn ) {
+ $artbuf .= $_;
+ if ( $outputOn == 1 && ( ! $opt_x || $foundtag )) {
+ print $artbuf;
+ } elsif ( $opt_r && ( ! $opt_x || $foundtag )) {
+ print RESID $artbuf;
+ }
+ $outputOn = 0;
+ }
+ $foundtag = 0;
+ }
+ elsif ( $outputOn ) {
+ $artbuf .= $_;
+ if ( /\<$opt_t/ ) {
+ $foundtag = 1;
+ $tagdata = $_;
+ while ( $tagdata !~ /\<\/$opt_t/ ) {
+ $_ = <>;
+ $artbuf .= $_;
+ $tagdata .= $_;
+ }
+ foreach $ptn ( @patrns ) {
+ last if (( $i = ( $tagdata !~ /$ptn/ )) == 0 );
+ }
+ if ( $i ^ $opt_v ) { $outputOn = ( $opt_r ) ? 2 : 0; }
+ }
+ }
+}
+
+if ( $opt_r ) {
+ close RESID;
+}
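In rough Python terms the selection loop above does the following (a sketch that assumes the tagged element fits on one line; artfilter.perl additionally accumulates multi-line elements, supports the residue file, and the -x flag):

    import re

    def filter_docs(lines, tag, patterns, invert=False):
        buf, keep = [], False
        for line in lines:
            buf.append(line)
            if line.lstrip().startswith("<" + tag):
                keep = any(re.search(p, line) for p in patterns)
            if "</DOC>" in line:
                if keep != invert:          # -v selects non-matching DOCs
                    print("".join(buf), end="")
                buf, keep = [], False

    # Hypothetical tag and data, for illustration only:
    doc = ["<DOC>\n", "<source>CNNWorld View</source>\n", "text...\n", "</DOC>\n"]
    filter_docs(doc, "source", ["World View"])   # prints the whole DOC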
diff --git a/egs/bn/s5/local/data_prep/csr_hub4_utils/bugproc.perl b/egs/bn/s5/local/data_prep/csr_hub4_utils/bugproc.perl
new file mode 100755
index 00000000000..48acad96c4e
--- /dev/null
+++ b/egs/bn/s5/local/data_prep/csr_hub4_utils/bugproc.perl
@@ -0,0 +1,69 @@
+#!/usr/bin/perl
+# $Id: bugproc.perl,v 1.4 1996/08/21 23:55:40 robertm Rel $
+###############################################################################
+# This software is being provided to you, the LICENSEE, by the Massachusetts #
+# Institute of Technology (M.I.T.) under the following license. By #
+# obtaining, using and/or copying this software, you agree that you have #
+# read, understood, and will comply with these terms and conditions: #
+# #
+# Permission to use, copy, modify and distribute, including the right to #
+# grant others the right to distribute at any tier, this software and its #
+# documentation for any purpose and without fee or royalty is hereby granted, #
+# provided that you agree to comply with the following copyright notice and #
+# statements, including the disclaimer, and that the same appear on ALL #
+# copies of the software and documentation, including modifications that you #
+# make for internal use or for distribution: #
+# #
+# Copyright 1991-4 by the Massachusetts Institute of Technology. All rights #
+# reserved. #
+# #
+# THIS SOFTWARE IS PROVIDED "AS IS", AND M.I.T. MAKES NO REPRESENTATIONS OR #
+# WARRANTIES, EXPRESS OR IMPLIED. By way of example, but not limitation, #
+# M.I.T. MAKES NO REPRESENTATIONS OR WARRANTIES OF MERCHANTABILITY OR FITNESS #
+# FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR #
+# DOCUMENTATION WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, #
+# TRADEMARKS OR OTHER RIGHTS. #
+# #
+# The name of the Massachusetts Institute of Technology or M.I.T. may NOT be #
+# used in advertising or publicity pertaining to distribution of the #
+# software. Title to copyright in this software and any associated #
+# documentation shall at all times remain with M.I.T., and USER agrees to #
+# preserve same. #
+###############################################################################
+
+# bugproc.comm
+# Removes some bugs common to all sources.
+# This script has no source-dependencies.
+
+while(<>)
+{
+ if ( /^</ ) # pass SGML as is
+ { print;
+ next;
+ }
+
+ s/\((\w)/( $1/g; # eg. (x -> ( x
+ s/\)(\w)/) $1/g; # eg. )x -> ) x;
+
+ s/(\d)\((\d)/$1 ($2/g; # \d(\d
+ s/(\d)\)(\d)/$1) $2/g; # \d)\d;
+ s/([a-zA-Z]{2,}\.)(\d)/$1 $2/g; # eg. Sept.30
+ s/,([a-zA-Z])/, $1/g; # eg. 20,Smith
+ s/(\W)milion(\W)/$1million$2/g; # spelling err
+
+ s/(\W&\s*)Co([^\w\.-])/$1Co.$2/g; # "& Co" -> "& Co."
+ s/(\WU\.S)([^\.\w])/$1.$2/g; # U.S -> U.S.
+
+ # next block added for Broadcast News archive processing
+ s/\$ +(\d)/\$$1/g; # e.g. "$ 5" -> "$5"
+ s/\$\#/\$/g; # e.g. "$#5" -> "$5" (typo??)
+ s/\#/number /g; # in bc-news, "#" = "number" not "pound"
+ s=([^\s<])/ =$1 / =g; # e.g. "2002/ " -> "2002 / "
+ s=([0-9])/1,000([^0-9,])=$1/1000$2=g; # e.g. "1/1,000" -> "1/1000"
+
+ s/\s{2,}/ /g;
+ s/^ //;
+ s/\s*$/ \n/;
+
+ print;
+}
diff --git a/egs/bn/s5/local/data_prep/csr_hub4_utils/do-lm b/egs/bn/s5/local/data_prep/csr_hub4_utils/do-lm
new file mode 100755
index 00000000000..6a4f66eef4e
--- /dev/null
+++ b/egs/bn/s5/local/data_prep/csr_hub4_utils/do-lm
@@ -0,0 +1,43 @@
+#!/bin/sh
+# $Id: do-lm,v 1.3 1996/08/23 22:43:23 robertm Rel $
+Usage()
+{
+cat << EOM 1>&2
+Usage: $0 dir file(s)
+ Runs LM pipeline on FILES, with output to the given DIR directory.
+ Expects to find LM conditioning tools in PATH or ./bin.
+EOM
+}
+
+# Excludes "fixvp" stage which has the main effect of killing off
+# any SGML tagging that contains a space, e.g. <art id=...>.
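For orientation, the per-file filter chain this script runs can be sketched in Python (a minimal sketch assuming the tools are on PATH or reachable by the relative paths used below; the same chain reappears in process_filelist.py later in this patch):

    import subprocess

    UTILS = "local/data_prep/csr_hub4_utils"
    STAGES = [
        "pare-sgml.perl",                       # strip non-LM SGML fields
        "bugproc.perl",                         # fix source-independent typos
        "numhack.perl",                         # pre-spell phone numbers / zips
        "numproc.perl -x%s/num_excp" % UTILS,   # expand numbers
        "abbrproc.perl %s/abbrlist" % UTILS,    # expand abbreviations
        "puncproc.perl -np",                    # remove punctuation
    ]

    def run_pipeline(gz_in, txt_gz_out):
        # mirrors: gunzip -c IN | stage1 | ... | stageN | gzip -c > OUT
        cmd = "gunzip -c %s | %s | gzip -c > %s" % (
            gz_in, " | ".join(STAGES), txt_gz_out)
        subprocess.check_call(cmd, shell=True)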
+
+# BBN used -np switch for puncproc, removing punctuation; this chooses the
+# "verbalize" option instead.
+
+# Includes new "numhack" module to deal with zip codes and phone numbers.
+
+if [ $# -eq 0 ] || [ $1 = "-h" ]; then
+ Usage
+ exit 1
+fi
+
+dir=$1
+shift
+
+for file in $*
+do
+ BASENM=`basename $file`
+ name="${BASENM%.*}"
+
+ echo "Running LM pipeline for |$BASENM|..." 1>&2
+ set -x
+ gunzip -c $file | pare-sgml.perl | \
+ bugproc.perl | \
+ numhack.perl | \
+ numproc.perl -xlocal/data_prep/csr_hub4_utils/num_excp | \
+ abbrproc.perl local/data_prep/csr_hub4_utils/abbrlist | \
+ puncproc.perl -np | gzip -c > $dir/$name.txt.gz
+ set +x
+ echo "Done with $BASENM."
+done
diff --git a/egs/bn/s5/local/data_prep/csr_hub4_utils/eval-material.ptrns b/egs/bn/s5/local/data_prep/csr_hub4_utils/eval-material.ptrns
new file mode 100644
index 00000000000..d6e34eb7357
--- /dev/null
+++ b/egs/bn/s5/local/data_prep/csr_hub4_utils/eval-material.ptrns
@@ -0,0 +1,4 @@
+ABCPrimetime Live
+CNNMorning News
+CNNWorld View
+NPRMorning Edition
diff --git a/egs/bn/s5/local/data_prep/csr_hub4_utils/num_excp b/egs/bn/s5/local/data_prep/csr_hub4_utils/num_excp
new file mode 100644
index 00000000000..0f93e6ae51c
--- /dev/null
+++ b/egs/bn/s5/local/data_prep/csr_hub4_utils/num_excp
@@ -0,0 +1,528 @@
+###############################################################################
+# This software is being provided to you, the LICENSEE, by the Massachusetts #
+# Institute of Technology (M.I.T.) under the following license. By #
+# obtaining, using and/or copying this software, you agree that you have #
+# read, understood, and will comply with these terms and conditions: #
+# #
+# Permission to use, copy, modify and distribute, including the right to #
+# grant others the right to distribute at any tier, this software and its #
+# documentation for any purpose and without fee or royalty is hereby granted, #
+# provided that you agree to comply with the following copyright notice and #
+# statements, including the disclaimer, and that the same appear on ALL #
+# copies of the software and documentation, including modifications that you #
+# make for internal use or for distribution: #
+# #
+# Copyright 1991-4 by the Massachusetts Institute of Technology. All rights #
+# reserved. #
+# #
+# THIS SOFTWARE IS PROVIDED "AS IS", AND M.I.T. MAKES NO REPRESENTATIONS OR #
+# WARRANTIES, EXPRESS OR IMPLIED. By way of example, but not limitation, #
+# M.I.T. MAKES NO REPRESENTATIONS OR WARRANTIES OF MERCHANTABILITY OR FITNESS #
+# FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR #
+# DOCUMENTATION WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, #
+# TRADEMARKS OR OTHER RIGHTS. #
+# #
+# The name of the Massachusetts Institute of Technology or M.I.T. may NOT be #
+# used in advertising or publicity pertaining to distribution of the #
+# software. Title to copyright in this software and any associated #
+# documentation shall at all times remain with M.I.T., and USER agrees to #
+# preserve same.
# +############################################################################### + +# exceptions: list of numbers to be expanded in exceptional ways +# derived by manual scan of early unigram file +# executed BEFORE numproc +# +# comments indicated by "#" in the first column + +# years +'20s twenties +'30s thirties +'40s forties +'50s fifties +'60s sixties +'70s seventies +'80s eighties +'90s nineties + +# processors +8086 eighty eighty-six +186 one eighty-six +286 two eighty-six +386 three eighty-six +486 four eight-six +187 one eighty-seven +287 two eighty-seven +387 three eighty-seven +80286 eighty two eighty-six +80386 eighty three eighty-six +80486 eighty four eighty-six +3090 thirty ninety +68020 sixty-eight oh twenty +68030 sixty-eight oh thirty + +# aircraft +707 seven oh seven +707s seven oh sevens +707's seven oh seven's +727 seven twenty-seven +727s seven twenty-sevens +727's seven twenty-seven's +737 seven thirty-seven +737s seven thirty-sevens +737-100 seven thirty-seven -- one hundred +737-100s seven thirty-seven -- one hundreds +737-200 seven thirty-seven -- two hundred +737-200s seven thirty-seven -- two hundreds +737-205 seven thirty-seven -- two oh five +737-300 seven thirty-seven -- three hundred +737-300s seven thirty-seven -- three hundreds +737-400 seven thirty-seven -- four hundred +737-400s seven thirty-seven -- four hundreds +737-500 seven thirty-seven -- five hundred +737-500s seven thirty-seven -- five hundreds +737-500's seven thirty-seven -- five hundred's +747 seven forty-seven +747s seven forty-sevens +747's seven forty-seven's +747F seven forty-seven F. +747-100 seven forty-seven -- one hundred +747-100s seven forty-seven -- one hundreds +747-124SF seven forty-seven -- one twenty four S. F. +747-200 seven forty-seven -- two hundred +747-200s seven forty-seven -- two hundreds +747-200's seven forty-seven -- two hundred's +747-200B seven forty-seven -- two hundred B. +747-200F seven forty-seven -- two hundred F. +747-273 seven forty-seven -- two seventy-three +747-300 seven forty-seven -- three hundred +747-341B seven forty-seven -- three forty-one B. +747-400 seven forty-seven -- four hundred +747-400s seven forty-seven -- four hundreds +747-500 seven forty-seven -- five hundred +747-500s seven forty-seven -- five hundreds +747-500's seven forty-seven -- five hundred's +757 seven fifty-seven +757s seven fifty-sevens +757's seven fifty-seven's +757-200 seven fifty-seven -- two hundred +757-200s seven fifty-seven -- two hundreds +757-225 seven fifty-seven -- two two five +757-232s seven fifty-seven -- two three twos +757-767 seven fifty-seven - seven sixty-seven +767 seven sixty-seven +767s seven sixty-sevens +767-200 seven sixty-seven -- two hundred +767-200s seven sixty-seven -- two hundreds +767-200ER seven sixty-seven -- two hundred E R +767-300 seven sixty-seven -- three hundred +767-300s seven sixty-seven -- three hundreds +767-300ER seven sixty-seven -- three hundred E R +767-300-ER seven sixty-seven -- three hundred E R + +A310 A. three ten +A320 A. three twenty +A330 A. three thirty +A340 A. three forty +A-310 A. three ten +A-320 A. three twenty +A-330 A. three thirty +A-340 A. three forty +A310s A. three tens +A320s A. three twenties +A330s A. three thirties +A340s A. three forties +A-310s A. three tens +A-320s A. three twenties +A-330s A. three thirties +A-340s A. three forties + +1011 ten eleven +1011s ten elevens + +MD-80 M. D. eighty + +# misc +#8mm eight millimeter +#35mm 35 millimeter +gp120 g. p. one-twenty +240SX two forty S. X. +RU486 R. U. 
four eighty-six +RU-486 R. U. four eighty-six + +3Com three Com +3COM three COM +3Com's three Com's +3COM's three COM's + +# serial number mode words +# marked by initial * (stripped in numproc) + +*year +*VAX +*Up +*mm +*ish +*point +*May +*Station +*inch +*ers +*and +*mark +*sec +*stock +*mid +*pre +*dBase +*Co + +# right contexts for dollar +$accord +$account +$acquisition +$ad +$addition +$additional +$advance +$agreement +$aid +$Air +$airport +$allowance +$amount +$annual +$appropriation +# "apartment" and "apartments" should be fixed, but would alter v1.0 +#$apartment +#$apartments +$area +$arms +$Army +$arrangement +$asset +$Atari +$auction +$average +$award +$backlog +$bailout +$balance +$bank +$bankroll +$barrier +$base +$based +$benchmark +$bid +$bill +$bills +$bond +$bonds +$bonus +$book +$bridge +$budget +$building +$Burger +$business +$buyout +$campaign +$cap +$capital +$car +$ceiling +$charge +$check +$checks +$claim +$Clean +$coffeepot +$coffeepots +$company +$companies +$compensation +$complex +$computer +$consortium +$construction +$consulting +$contract +$contracts +$contribution +$contributions +$convertible +$cost +$costs +$court +$credit +$cumulative +$cut +$deal +$debenture +$debentures +$debt +$decline +$decrease +$deductible +$default +$Defense +$defense +$defensive +$deficit +$denominations +$deposit +$development +$difference +$disallowance +$Distillers +$dividend +$domestic +$donor +$donors +$drop +$effort +$emergency +$endowment +$energy +$equity +$estate +$estimate +$Eurobond +$exemption +$expansion +$expense +$face +$facility +$fare +$federal +$fee +$fence +$Fidelity +$figure +$financing +$fine +$fines +$First +$foreign +$FSLIC +$fund +$funds +$gain +$gains +$gap +$goal +$gold +$grant +$guarantee +$hammer +$hammers +$highway +$home +$Hong +$hostile +$house +$income +$increase +$industry +$infusion +$initial +$installment +$investment +$issue +$issues +$judgment +$junk +$Kansai +$laboratory +$lawsuit +$LBO +$legal +$letter +$level +$leveraged +$liability +$limit +$line +$litigation +$loan +$loans +$loss +$machine +$mark +$market +$maximum +$measure +$merger +$Midland +$minimum +$mortgage +$Navy +$net +$note +$notes +$obligation +$obligations +$offer +$offering +$offerings +$office +$order +$outlay +$package +$pact +$payout +$payment +$payments +$penalty +$Pennzoil +$pension +$Pentagon +$pipeline +$plan +$plant +$portion +$premium +$price +$principal +$prize +$proceeds +$production +$profit +$program +$project +$proposal +$provision +$purchase +$purse +$Putnam +$question +$range +$rate +$reactor +$reactors +$rebate +$rebates +$recapitalization +$record +$redemption +$reduction +$refund +$renovation +$request +$rescue +$research +$reserve +$restructuring +$retirement +$revolving +$rise +$River +$salary +$sale +$sales +$Saturn +$savings +$series +$settlement +$share +$shelf +$shortage +$shortfall +$software +$spacecraft +$special +$stake +$station +$stock +$study +$suit +$suits +$sum +$surge +$surplus +$system +$tab +$takeover +$takeover +$target +$tax +$Templeton +$tender +$threshold +$toilet +$total +$trade +$transaction +$trigger +$trust +$value +$venture +$verdict +$vessel +$Waterford +$windfall +$wine +$Winsor +$world +$World + +# skip before right context for dollar +$$advertising +$$asking +$$civil +$$closing +$$commercial +$$common +$$compensatory +$$Contra +$$corporate +$$damage +$$economic +$$energy +$$European +$$first +$$general +$$global +$$government +$$housing +$$insurance +$$interest +$$interim +$$international +$$junior +$$libel +$$marketing 
+$$municipal
+$$nationwide
+$$new
+$$nuclear
+$$omnibus
+$$outstanding
+$$personal
+$$pretax
+$$private
+$$projected
+$$proposed
+$$public
+$$punitive
+$$real
+$$retail
+$$refunding
+$$refinancing
+$$retirement
+$$revenue
+$$second
+$$secured
+$$security
+$$semi-annual
+$$senior
+$$space
+$$state
+$$State
+$$stated
+$$taxable
+$$term
+$$testing
+$$thrift
+$$trading
diff --git a/egs/bn/s5/local/data_prep/csr_hub4_utils/numhack.perl b/egs/bn/s5/local/data_prep/csr_hub4_utils/numhack.perl
new file mode 100755
index 00000000000..be8e611a2b0
--- /dev/null
+++ b/egs/bn/s5/local/data_prep/csr_hub4_utils/numhack.perl
@@ -0,0 +1,80 @@
+#!/usr/bin/perl
+
+# $Id: numhack.perl,v 1.4 1996/08/23 05:12:27 robertm Rel $
+# preprocessor for numproc, potentially specialized for Broadcast News material
+
+# tries to patch numproc's problems with:
+# - telephone numbers
+# - zip codes
+# for example:
+# 1-800-555-1212
+# => one - eight hundred - five five five - one two one two
+# (215) 555-1212
+# => two one five - five five five - one two one two
+# 212/285-9400
+# => two one two - two eight five - nine four zero zero
+# 1-(800)-CAR-CASH
+# => one - eight hundred -CAR-CASH
+# New York, NY 10007
+# => New York, NY one zero zero zero seven
+# Philadelphia, PA 19104-6789
+# => Philadelphia, PA one nine one oh four - six seven eight nine
+
+# may leave behind extra spaces here and there, but later processes ought
+# to correct that...
+
+@ones_oh=("oh","one","two","three","four",
+ "five","six","seven","eight","nine");
+
+while(<>)
+{
+ next unless /\d/; # skip lines without numbers
+ next if /^<\/?[aps]/; # skip SGML
+
+ # probable Zip codes
+ s/\b(\d{5}-\d{4})\b/&SpellDigits($1)/eg; # 12345-6789
+ s/\b(\d{5})\b/&SpellDigits($1)/eg; # 12345
+
+ # phone numbers
+ s=(^| )([1l][- ])?\(?([2-9]\d{2})\)?[-/]? ?(\d{3})-(\d{4})\b=&SpellTel($2,$3,$4,$5)=eg; # 215-555-1212 etc.
+ s/(^| )(\d{3}-\d{4})\b/&SpellDigits($2)/eg; # 555-1212
+ s/\b1-\(?800\)?(\W)/ one - eight hundred $1/g; # isolated 1-800
+ s/([Aa]rea code) (\d{3})(\W)/"$1 ".&SpellDigits($2)."$3"/eg;
+
+} continue {
+ print;
+}
+
+exit;
+
+sub SpellDigits
+{
+ local($num)=$_[0];
+ $num =~ s/(\d)(\D)(\d)/$1 $2 $3/g; # add space around non-digits
+ # isolated zeros become "oh", string of them become "zero ..."
+ $num =~ s/(00+)/" zero" x length($1)/eg;
+ $num =~ s/(\d)/" $ones_oh[$1]"/eg;
+ return $num;
+}
+
+sub SpellTel
+{
+ local($pre,$area,$exch,$rest)=@_;
+ $return = $pre ? " one -" : " ";
+ if ($area =~ /(\d)00/)
+ {
+ $return .= &SpellDigits($1);
+ $return .= " hundred";
+ }
+ else
+ {
+ $return .= &SpellDigits($area);
+ }
+ $return .= " - ";
+
+ $return .= &SpellDigits($exch);
+ $return .= " - ";
+ $return .= &SpellDigits($rest);
+
+ return $return;
+}
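The digit-spelling rule above (a lone zero is read "oh", a run of zeros is read "zero zero ...") can be stated compactly in Python; this sketch reproduces the two zip-code examples from the header comments:

    ONES_OH = ["oh", "one", "two", "three", "four",
               "five", "six", "seven", "eight", "nine"]

    def spell_digits(num):
        words, i = [], 0
        while i < len(num):
            if num[i] == "0":
                j = i
                while j < len(num) and num[j] == "0":
                    j += 1
                # single zero -> "oh"; a run of zeros -> "zero" repeated
                words += ["oh"] if j - i == 1 else ["zero"] * (j - i)
                i = j
            else:
                words.append(ONES_OH[int(num[i])])
                i += 1
        return " ".join(words)

    print(spell_digits("19104"))   # -> one nine one oh four
    print(spell_digits("10007"))   # -> one zero zero zero seven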
diff --git a/egs/bn/s5/local/data_prep/csr_hub4_utils/numproc.perl b/egs/bn/s5/local/data_prep/csr_hub4_utils/numproc.perl
new file mode 100755
index 00000000000..e97d3ae51dd
--- /dev/null
+++ b/egs/bn/s5/local/data_prep/csr_hub4_utils/numproc.perl
@@ -0,0 +1,1134 @@
+#! /usr/bin/perl
+#
+# $Id: numproc.perl,v 1.7 1996/08/23 05:04:11 robertm Rel $
+###############################################################################
+# This software is being provided to you, the LICENSEE, by the Massachusetts #
+# Institute of Technology (M.I.T.) under the following license. By #
+# obtaining, using and/or copying this software, you agree that you have #
+# read, understood, and will comply with these terms and conditions: #
+# #
+# Permission to use, copy, modify and distribute, including the right to #
+# grant others the right to distribute at any tier, this software and its #
+# documentation for any purpose and without fee or royalty is hereby granted, #
+# provided that you agree to comply with the following copyright notice and #
+# statements, including the disclaimer, and that the same appear on ALL #
+# copies of the software and documentation, including modifications that you #
+# make for internal use or for distribution: #
+# #
+# Copyright 1991-4 by the Massachusetts Institute of Technology. All rights #
+# reserved. #
+# #
+# THIS SOFTWARE IS PROVIDED "AS IS", AND M.I.T. MAKES NO REPRESENTATIONS OR #
+# WARRANTIES, EXPRESS OR IMPLIED. By way of example, but not limitation, #
+# M.I.T. MAKES NO REPRESENTATIONS OR WARRANTIES OF MERCHANTABILITY OR FITNESS #
+# FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR #
+# DOCUMENTATION WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, #
+# TRADEMARKS OR OTHER RIGHTS. #
+# #
+# The name of the Massachusetts Institute of Technology or M.I.T. may NOT be #
+# used in advertising or publicity pertaining to distribution of the #
+# software. Title to copyright in this software and any associated #
+# documentation shall at all times remain with M.I.T., and USER agrees to #
+# preserve same. #
+###############################################################################
+
+# preprocessor for WSJ
+# assumes 1 sentence per line
+#
+# 1. expand numerical exceptions: eg. 386
+# 2. do regular numerical expansions
+
+# Minor modifications by David Graff, Linguistic Data Consortium, in preparation
+# for publishing on cdrom; Aug. 11, 1994.
+
+$POINT='.POINT'; # orthographic notation for .
+
+ # final s in name indicates plural version, otherwise just add s
+@ones_z=("zero","one","two","three","four",
+ "five","six","seven","eight","nine");
+@ones_oh=("oh","one","two","three","four",
+ "five","six","seven","eight","nine");
+@ten=("","ten","twenty","thirty","forty","fifty",
+ "sixty","seventy","eighty","ninety");
+@teen=("ten","eleven","twelve","thirteen","fourteen","fifteen",
+ "sixteen","seventeen","eighteen","nineteen");
+@mult=("","thousand","million","billion","trillion"
+ ,"quadrillion","quintillion","sextillion","septillion","octillion");
+@den=("","","half","third","quarter","fifth",
+ "sixth","seventh","eighth","ninth","tenth",
+ "eleventh","twelfth","thirteenth","fourteenth","fifteenth",
+ "sixteenth","seventeenth","eighteenth","nineteenth");
+@largeden=("","first","second","third","fourth","fifth",
+ "sixth","seventh","eighth","ninth","tenth",
+ "eleventh","twelfth","thirteenth","fourteenth","fifteenth",
+ "sixteenth","seventeenth","eighteenth","nineteenth");
+@ordnal=("","first","second","third","fourth","fifth",
+ "sixth","seventh","eighth","ninth","tenth",
+ "eleventh","twelfth","thirteenth","fourteenth","fifteenth","sixteenth");
+@months=("Jan.","Feb.","Mar.","Apr.","Jun.","Jul.","Aug.","Sept.","Oct.",
+ "Nov.","Dec.","January","February","March","April","May","June",
+ "July","August","September","October","November","December");
+
+$exfile="$ENV{HOME}/bc-news/bin/num_excp"; # default exceptions file name
+
+for($i=0,$j=0;$i<=$#ARGV;$i++)
+{ if($ARGV[$i] =~ /^-/)
+ { if($ARGV[$i] =~ /^-v/) {$vflg=1;}
+ elsif($ARGV[$i] =~ /^-x/)
+ { $exfile=$ARGV[$i];
+ $exfile =~ s/^-x//;
+ }
+ else {&perr2("illegal flag: $ARGV[$i]"); }
+ }
+ else { &perr2("no file args"); }
+}
+@ARGV=();
+
+if(!$exfile) {&perr2("no exceptions file specified"); }
+
+if(!open(EXFILE,$exfile)) {&perr2("cannot open $exfile"); }
+while(<EXFILE>)
+{ if(/^#/) {next;} # comment
+ s/\n//;
+ if(!$_) {next;} # blank
+ $y=$_;
+ s/^(\S+)\s*//; # extract 1st word
+ $x=$1;
+ if($x eq "") {&perr2("$exfile: no word: $y");}
+ if($x =~ /^\$\$/) # $$word => skip
+ { $x =~ s/^\$*//;
+ $sing_dollar{$x}=2;
+ }
+ elsif($x =~ /^\$/) # $word => singular right context
+ { $x =~ s/^\$*//;
+ $sing_dollar{$x}=1;
+ }
+ elsif($x =~ /^\*/)
+ { $x =~ s/\**//g;
+ if(!$x) {&perr2("$exfile: no serno word");}
+ $sernowd{$x}=1; # serial no words
+ }
+ else
+ { if($x !~ /\d/) {&perr2("$exfile: non-numerical key");}
+ if(!$_) {&perr2("$exfile: no value");}
+
+ $except{$x}=$_; # translations
+ }
+ $n++;
+}
+close(EXFILE);
+if($vflg) {print STDERR "$n lines read from exceptions file\n";}
+
+for($i=0;$i<=$#months;$i++) # make months hash
+{ $_=$months[$i];
+ $months{$_}=1; # mixed case
+ tr/a-z/A-Z/;
+ $months{$_}=1; # UC
+}
+
+while(<>)
+{ # removed local($front,$back,$x) to conserve memory RWM 8/96
+
+############################## exceptproc ##################################
+ s/^\s*//;
+ s/\n//o;
+ if($vflg) {print "input:\t$_\n";}
+ if(/\d/ && !/^<\/?[spa]/) # opt and protect sgml
+ { @input = split(/\s+/o);
+ @output=();
+ for($field=0;$field<=$#input;$field++) # $field is global
+ { $_=$input[$field];
+
+ if(!/\d/) # only processes numbers
+ { &pusho($input[$field]); # not processed
+ next;
+ }
+
+ s/^(\W*)//o; # strip front
+ $front=$1;
+ if($front =~ /\$$/ || $front =~ /#$/) # protect money
+ { &pusho($input[$field]); # not processed
+ next;
+ }
+
+ s/(\W*)$//o; # strip back
+ $back=$1;
+
+ if($front =~ /\'$/ && $except{"'$_"}) # eg "'20s"
+ { $front =~ s/\'$//;
+ if($front)
+ { &pusho($front);
+ if($front !~ /[\w]$/o) {$appendflg=1;}
+ } + + &pusho($except{"'$_"}); # translation + + if($back) + { if($back !~ /^[\w]/o) {&appendo($back);} + else {&pusho($back);} + } + } + elsif($except{$_}) + { if($front) + { &pusho($front); + if($front !~ /[\w]$/o) {$appendflg=1;} + } + + &pusho($except{$_}); # translation + + if($back) + { if($back !~ /^[\w]/o) {&appendo($back);} + else {&pusho($back);} + } + } + else {&pusho($input[$field]);} # not processed + } + $_=join(" ",@output); + } + s/\s+/ /g; + s/^ //o; + s/ $//o; + if($vflg) {print "ex:\t$_\n";} + +############################ numproc ######################################## + if(!/^<\/?[spa]/) # protect sgml, also art + { s/(\d+)-(\d+)-(\d+)/$1 - $2 - $3/g; # eg. 1-2-3 + s/(\d+)x(\d+)/$1 by $2/g; # eg. 2x4 + s/(\d+)\+(\d+)/$1 plus $2/g; # eg. 2+2 + s=(\d)-(\d)[/\\](\d)=$1 $2/$3=g; # e.g. 3-1/2 + s=(\d)\\(\d)=$1/$2=g; # e.g. 1\2 for 1/2 + s/\$(\d[\d,]*)-\$(\d)/$1 to \$$2/g; # $ range: eg. $1-$2 + s/\$(\d[\d,]*)-(\d)/$1 to \$$2/g; # $ range: eg. $1-2 + s/(\d)-(\'?)(\d)/$1 to $2$3/g; # range: eg. 1-2 + s/%-(\d)/% to $1/g; # % range: eg. 1%-2% + s/(\d)=(\d)/$1 equals $2/g; # equation: x=y + s/ - / -- /g; # recode dashes + s/([^-\d\s])-([^-\d\s])/$1 - $2/g; # split in-word hyphens + s/- +-/--/g; s/- +-/--/g; # close dashes + s/-{3,}/--/g; # map dashes to -- + s/--/ -- /g; # space around -- + s/(\d) +(\d+\/\d)/$1 and $2/g; # dig frac -> dig and frac + s/([a-zA-Z])\//$1 \/ /g; # text/* + s/\/([a-zA-Z])/ \/ $1/g; # */text + + s/([a-zA-Z]\d+)\/(\d+)/$1 \/ $2/g; # eg. a1/3 -> a1 / 3 + s/(\/\d*)th/$1/ig; # eg. 1/10th -> 1/10 + s/(\/\d*1)st/$1/ig; # eg. 1/x1st -> 1/x1 + s/(\/\d*2)nd/$1/ig; # eg. 1/x2nd -> 1/x2 + s/(\/\d*3)rd/$1/ig; # eg. 1/x3rd -> 1/x3 + s/(\d+)\/(\d+[a-zA-Z])/$1 \/ $2/g; # eg. 1/3a -> 1 / 3a + s/([a-zA-Z])-(19\d\d\D)/$1 - $2/g; # eg. mid-1990 -> mid - 1990 +# s/([a-zA-Z])-(\d)/$1 $2/g; # eg. a-1 -> a 1 +# s/(\d)-([a-zA-Z])/$1 $2/g; # eg. 1-a -> 1 a + s/([a-zA-Z])-(\d)/$1 - $2/g; # eg. a-1 -> a - 1 + s/(\d)-([a-zA-Z])/$1 - $2/g; # eg. 1-a -> 1 - a + + # fix common time typo (; for :) + s/\b([012]?\d);([0-5]\d)\b/$1:$2/g; # e.g. 11;00 -> 11:00 + + if(!/\d:\d\d$/o && !/\d:\d\d\D/o) # preprocess non-time \d:\d + { s/(\d):(\d)/$1 : $2/g; + s/(\S):(\d)/$1: $2/g; + } + } + + if($vflg) {print "num1:\t$_\n";} + + s/^\s*//; + if(/\d/ && !/^<\/?[spa]/) # opt and protect sgml + { @input = split(/\s+/o); + @output=(); + for($field=0;$field<=$#input;$field++) # $field is global + { if($field>0) {$last=$input[$field-1];} + else {$last='';} + if($field<$#input) {$next=$input[$field+1];} + else {$next='';} + if($field<$#input-1) {$next2=$input[$field+2];} + else {$next2='';} + $this=$input[$field]; + $_=$input[$field]; + + if(/<[\w\.\/]*>/o && !/
<art[\w\.\/]*>/o) # pass <art> tags only
+ {&perr("spurious SGML: $_"); next; } #
+
+ if(/[0-9]/o && !/

$40 + + if($front) + { &pusho($front); # generally punctuation + if($front !~ /\w$/) {$appendflg=1;} + } + + $x=$_; + if($x =~ /\//) + { $x =~ s/^\D*//; + $x =~ s/\D*$//; + if (! &printfrac($x)) {return 0;} + &pusho("of a $unit"); + $x=""; + $plural=0; + } + + $x =~ s/^\D*([\d,]*)\D*.*$/$1/; # int part of string + if($x ne "") {if (! &printint($x)) {return 0;} } # print int part (eg. dollars) + + if($next eq "and" && $next2 =~ /\d\/\d/ && next2 !~ /\/.*\//) + { if($unit && $x ne "") {&pusho("and");} # frac: eg 4 1/16 + $z=$next2; + $z =~ s/\D*$//; + if (! &printfrac($z)) {return 0;} + ($punct)=($next2 =~ /(\D*)$/); + $field+=2; + &pusho("${unit}s"); + + if($back) {&perr("money: back and 1 1/3"); return 0;} + + if($punct) {&appendo($punct);} # punctuation from *illion + return 1; + } + + if($back eq "" && $next =~ /^(thousands?|[a-z]*illions?)(\W*)/i) + { if (! &printdecfrac($_)) {return 0;} # multiplier + &pusho($1); + $punct=$2; + $plural=1; ### if adj '', if noun 's' + $field++; + $frac=1; + } + elsif(/\.\d$/ || /\.\d\D/ || /\.\d{3}/ ) # .d or .ddd+ + { if (! &printdecfrac($_)) {return 0;} + $plural=1; # can be either + $frac=1; + } + else + { $y=$x; + $y =~ s/,//g; # remove commas + if(int($y)!=1) {$plural=1;} + } + + if($back eq "" && $input[$field+1] =~ /dollar/i) + { $unit=""; # fix "$1 dollar" wsj typo + $subunit_sing=""; + $subunit_pl=""; + if (! &printdecfrac($_)) {return 0;} + $frac=1; + } + +#print "f=$front, m=$_, b=$back\n"; +#foo + $sing=0; + if($last =~ /^\W*[aA][nN]?\W*$/) {$sing=1;} # a $123, an $80 + elsif($input[$field+1] eq "-") {$sing=1;} # eg. $123-a-day + # next one is chancy + elsif($input[$field] !~ /\W$/ && $input[$field+1] !~ /^\W/ && + $input[$field+1] =~ /[a-zA-Z]$/ && $input[$field+2] eq "-" && + $input[$field+3] =~ /^[a-zA-Z]/) {$sing=1;} # $ after-tax + + elsif($back eq "" && !$punct) # right contexts with no intervening punct + { $j=$field+1; # includes *ly as a skip + $z=""; + for($i=0;$i<2;$i++,$j++) # skip ? + { $y=$input[$j]; # strip final punct + $y =~ s/\W*$//; + if($y !~ /\w*ly$/i && $sing_dollar{$y}!=2) {last;} + ($y)=($input[$j] =~ /(\W*)$/); # get final punct + $z .= $y; # accumulate + } + $y=$input[$j]; # strip final punct + $y =~ s/\W*$//; + if($z eq "" && $sing_dollar{$y}==1) {$sing=1;} + } + + if($unit) # print unit + { &pusho($unit); + if($plural && !$sing) {&appendo("s");} # just add s for plural + } + + if(!$frac && /\.\d{2}/) # .dd (eg. cents) + { $y=$_; + $y =~ s/^[^\.]*\.([\d]*)\D?.*$/$1/; # get fractional part + if($unit && $x ne "") {&pusho("and");} + if (! &printint($y)) {return 0;} + if($sing || int($y)==1) {&pusho($subunit_sing);} + else {&pusho($subunit_pl);} + } + + if($back) # punctuation from this field + { if($punct) {&perr("money: back and punct"); return 0;} + + if($back =~ /^\w/) {&pusho($back);} + else {&appendo($back);} + } + + if($punct) {&appendo($punct);} # punctuation from *illion + + return 1; +} + +sub printyear # &printyear(x) +{ if($vflg) {print "printyear: $_[0]\n";} + return &printnum($_[0]); # for now +} + +sub printtime # &printtime(x) +{ if($vflg) {print "printtime: $_[0]\n";} + $_=$_[0]; + + local(@x); + local($front); + local($back); + + if(/:{2,}/ || !/\d:\d/) {&perr("printtime: not a time"); return 0;} + + @x=split(/:/,$_); + ($front)=($x[0] =~ /^(\D*)/); + $x[0] =~ s/^(\D*)//; + ($back)=($x[1] =~ /(\D*)$/); + $x[1] =~ s/(\D*)$//; + + if($front) + { &pusho($front); # generally punctuation + if($front !~ /\w$/) {$appendflg=1;} + } + if (! 
&printint($x[0])) {return 0;} + if($x[1]==0) + { $_=$next; + if(!/^[aApP]\.?[nM]\.?$/) {&pusho("o'clock");} + } + elsif ($x[1]<10) + { &pusho("oh"); + if (!&printint($x[1])) {return 0;} + } + else {if (! &printint($x[1])) {return 0;} } + if($back) + { if($back =~ /^\w/) {&pusho($back);} + else {&appendo($back);} # generally punctuation + } + return 1; +} + +sub printfrac +{ if($vflg) {print "printfrac: $_[0]\n";} + local($x)=$_[0]; + + local(@z); #Perl BUG: lists do not seem to be local + local($sign); + local($front); + local($back); + local($sign); + + $x =~ s/^([^\d\.]*)//; # strip front + $front=$1; + if($front =~ /^\+$/) # get sign + { $sign="plus"; + $front =~ s/\+$//; + } + if($front =~ /^-$/) + { $sign="minus"; + $front =~ s/-$//; + } + + if($x =~ /\D$/) + { ($back)=( $x =~ /(\D*)$/ ); + $x =~ s/\D*$//; # strip back: final . is punct + } + + @z=split(/\//,$x); + if($#z !=1) {&perr("printfrac: illegal fraction: $_[0]"); return 0;} + if($z[1] <= 1) {&perr("printfrac: den too small: $_[0]"); return 0;} + + if($front) + { &pusho($front); + if($front =~ /[a-zA-Z]$/) {&appendo("-");} + $appendflg=1; + } + + if($sign) {&pusho($sign);} + + if (! &printint($z[0])) { return 0;} #numerator + if($z[1] <= $#den) # small den from table (<20) + { &pusho($den[$z[1]]); + if($z[0]!=1) {if (! &pluralize) {return 0;} } + } + else #large den + { $ones=int($z[1]%100); + $hun=100*int($z[1]/100); + if($hun>0) {if (!&printint($hun)) {return 0;} } + if($ones==0) + { &appendo("th"); + if($z[0]!=1) {if (! &pluralize) {return 0;} } + } + elsif($ones<=$#largeden) # <20 + { &pusho($largeden[$ones]); + if($z[0]!=1) {if (!&pluralize) {return 0;} } + } + else + { $x=int($ones%10); + if(int($ones/10)) + { &pusho($ten[int($ones/10)]); + if($x) + { &appendo("-"); # eg. twenty-five + $appendflg=1; + } + } + if($x==0) + { &pusho("th"); + if($z[0]!=1) {if (! &pluralize) {return 0;} } + } + else + { &pusho($largeden[$x]); + if($z[0]!=1) {if (! &pluralize) {return 0;} } + } + } + } + + if($back) + { $x=&geto; # in case of 1/10th etc ([stndrth]=st nd rd th) + if($back !~ /^[stndrth]{2}/ || $x !~ /$back$/) + { if($back =~ /^[a-zA-Z]/) {&appendo("-");} + &appendo($back); + } + } + + return 1; +} + +sub printnum # printnum(n) +{ if($vflg) {print "printnum: $_[0]\n";} + local($x)=$_[0]; # print ordinary numbers + + $leadingzeroflg=''; # global + local($front); + local($back); + local($intpart); + local($fracpart); + local($hun); + local($ones); + local($comma); + local($sign); + local($y); + + $x =~ s/^(\D*)//; # strip front + $front=$1; + if($front =~ /^\.$/ || $front =~ /\W\.$/ || + ($front =~ /\.$/ && $x =~ /^0/ )) # leading . + { $front =~ s/\.$//; + $x = "." . $x; + } + if($front =~ /^\+$/) # get sign + { $sign="plus"; + $front =~ s/\+$//; + } + if($front =~ /^-$/) + { $sign="minus"; + $front =~ s/-$//; + } + + if($x =~ /\D$/) + { $back=$x; + $back =~ s/^[\d\.,]*\d//; + $x =~ s/\D*$//; # strip back: final . is punct + } + + if($x =~ /[^\d\.,]/) {&perr("printnum: $_[0] is not a number"); return 0;} + + if($x!=0 && $x =~ /^0/ && $x =~ /^\d*$/) # "oh" numbers + { if($front) + { &pusho($front); + if($front !~ /[a-zA-Z]$/) {$appendflg=1;} + } + + if($sign) { &pusho($sign); } + + while($x ne '') + { $x =~ s/^(.)//; + &pusho($ones_oh[$1]); + } + + if($back) + { if($back =~ /^s$/ || $back =~ /^s\W/) # back = s + { if (! &pluralize) {return 0;} # eg. 
1960s + $back =~ s/^s//; + } + if($back) + { if($back =~ /^[a-zA-Z]/) {&pusho($back);} + else {&appendo($back);} # back = punct or "'s" + } + } + return 1; + } + + if($x =~ /^\d/) # get integer part + { if($x =~ /,/) + { $comma=1; + $x =~ s/,//g; # strip commas + } + $intpart=$x; + $intpart =~ s/\..*$//; + if($x =~ /^0/) {$leadingzeroflg=1;} + } + + if($x =~ /\./) # get fractional part + { $fracpart=$x; + $fracpart =~ s/^.*\././; + } + + if($front) + { &pusho($front); + if($front !~ /[a-zA-Z]$/) {$appendflg=1;} + } + + if($sign) { &pusho($sign); } + + $ones=int($intpart%100); + if($comma) {if (! &printint($intpart)) {return 0;} } + elsif(($intpart>=1900 || $intpart>=1100 && $ones==0) + && $intpart<2000 && !$fracpart) #4 digit -> 2+2 + { $hun=int($intpart/100); + if (! &printint($hun)) {return 0;} + if($ones>=10) {if (! &printint($ones)) {return 0;} } + elsif($ones>0) + { &pusho("oh"); + if (! &printint($ones)) {return 0;} + } + else {&pusho("hundred");} + } + else + { if (! &printint($intpart)) {return 0;} + $y=$last; + $y =~ s/^\W*//; # thize dates: May 25th + if(length($intpart)<=2 && $months{$y}) + { if (! &thize("")) {return 0;} + $back =~ s/[a-z]//g; + } + } + if($fracpart) {if (! &printdecfrac($fracpart)) {return 0;} } + + if($back) + { if($back =~ /^s$/ || $back =~ /^s\W/) # back = s + { if (! &pluralize) {return 0;} # eg. 1960s + $back =~ s/^s//; + } + if($back =~ /^st$/ || $back =~ /^st\W/) # back= st + { if (! &thize("st")) {return 0;} # eg. 1st + $back =~ s/^st//; + } + if($back =~ /^nd$/ || $back =~ /^nd\W/) # back= nd + { if (! &thize("nd")) {return 0;} # eg. 2nd + $back =~ s/^nd//; + } + if($back =~ /^rd$/ || $back =~ /^rd\W/) # back= rd + { if (! &thize("rd")) {return 0;} # eg. 3rd + $back =~ s/^rd//; + } + if($back =~ /^th$/ || $back =~ /^th\W/) # back= th + { if (! &thize("th")) {return 0;} # eg. 4th + $back =~ s/^th//; + } + if($back) + { if($back =~ /^[a-zA-Z]/) {&pusho($back);} + else {&appendo($back);} # back = punct or "'s" + } + } + return 1; +} + +sub printdate # printdate(n): x/x/x format +{ if($vflg) {print "printdate: $_[0]\n";} + local($x)=$_[0]; # print ordinary numbers + + local(@y); + local($front); + local($back); + + $x =~ s/^(\D*)//; # strip front + $front=$1; + + $x =~ s/(\D*)$//; # strip back + $back=$1; + + if($x !~ /^\d{1,2}\/\d{1,2}\/(19)?\d{2}$/) + {&perr("printdate: $_[0] is not a date"); return 0;} + + @y=split(/\//,$x); + $y[2] =~ s/^19(\d{2})$/$1/; + + if($front) + { &pusho($front); + if($front =~ /[a-zA-Z]$/) {&appendo("-");} + $appendflg=1; + } + + if (! &printint($y[0])) {return 0;} + &appendo("/"); + + $appendflg=1; + if (! &printint($y[1])) {return 0;} + &appendo("/"); + + $appendflg=1; + if (! &printint($y[2])) {return 0;} + + if($back) + { if($back =~ /^[a-zA-Z]/) {&appendo("-");} + &appendo($back); + } + return 1; +} + +sub printserno # printserno(n): eg. B1, 3b2, 10W-40 +{ if($vflg) {print "printserno: $_[0]\n";} + local($x)=$_[0]; # print mixed sequences of dig and let + + local($y); + local($z); + local($front); + local($back); + + $x =~ s/^(\W*)//; # strip front + $front=$1; + if($front) + { &pusho($front); + if($front !~ /[a-zA-Z]$/) {$appendflg=1;} + } + + $x =~ s/(\W*)$//; # strip back + $back=$1; + $x =~ s/(\d[a-zA-Z]+\d+)(\'?s)$/$1/ # strip "s" or "'s" + && ($back = $2 . 
$back); + + while($x) + { $x =~ s/^(\D*)//; # strip off non-dig + $y=$1; + if($y) + { $y =~ s/-//g; # remove - + if($y eq "") {} + elsif($sernowd{$y}) {&pusho($y);} # word + else + { while($y) # spell out + { if($y =~ /[a-zA-Z]\'s$/) + { &pusho($y); + $y =~ s/[a-zA-Z]\'s*$//; + } + elsif($y =~ /[A-Z]s$/) + { &pusho($y); + $y =~ s/[A-Z]s$//; + } + else + { $y =~ s/^(.\.?)//; + &pusho($1); + } + } + } + } # (should expand here unless in dictionary) + $x =~ s/^(\d*)//; # strip off dig + $y=$1; + if($y ne "") { if (! &printdigstr($y)) {return 0;} } + } + + if($back =~ /^s\b/) # back = s + { # eg. 2C60s + if (! &pluralize) {return 0;} + $back =~ s/^s//; + } + if($back) + { if($back =~ /^\w/) {&pusho($back);} + else {&appendo($back);} + } + $appendflg=0; + return 1; +} + +sub printdigstr # printdigstr(x) +{ if($vflg) {print "printdigstr: $_[0]\n";} + local($x)=$_[0]; + + local(@y); + local($j); + local($k); + + if($x =~ /^0/) # leading zero + { while($x ne "") + { $x =~ s/^(.)//; + if($1 !~ /\d/) {&perr("printdigstr: non-digit"); return 0;} + &pusho("$ones_z[$1]"); + } + return; + } + if($x =~ /^\d0*$/) # d, d0, d00, d000, etc + { return &printint($x); + } + + $_=$x; + @y=(); + for($j=0;$_ ne "";$j++) { $y[$j]=chop($_); } # j=no digits + for($k=0;$y[$k]==0;$k++) {} # k= nr following 0s + + if($j==2) # 2 dig + { return &printint($x); + } + if($j==3) + { if (! &printint($y[2])) {return 0;} + if($y[1]==0) {&pusho("oh");} + return &printint("$y[1]$y[0]"); + } + if($j==5 && $k<=2) + { if (! &printint("$y[4]")) {return 0;} + $j=4; + } + if($j==4) + { if (! &printint("$y[3]$y[2]")) {return 0;} + if($k==2) {&pusho("hundred");} + else + { if($y[1]==0) {&pusho("oh");} + return &printint("$y[1]$y[0]"); + } + return 1; + } + # >5 dig: just sequential dig + for($j--;$j>=0;$j--) {&pusho("$ones_oh[$y[$j]]");} + return 1; +} + +sub printftin # printftin(n): eg. 6\'-4\" +{ if($vflg) {print "printftin: $_[0]\n";} + local($x)=$_[0]; # print mixed sequences of dig and let + + local($y); + local($front); + local($back); + + $x =~ s/^(\D*)//; # strip front + $front=$1; + + $x =~ s/(\D*)$//; # strip back + $back=$1; + $back =~ s/^\"//; # remove \" + + if($front) + { &pusho($front); + if($front !~ /[a-zA-Z]$/) {$appendflg=1;} + } + + $x =~ s/^([\d\.]*)//; # strip off dig & . + $y=$1; + if(!$y) {&perr("printftin: bad feet"); return 0;} + if (! &printnum($y)) {return 0;} + if($y==1) {&appendo("-foot");} + else {&appendo("-feet");} + + $x =~ s/^\'//; # strip off \' + $x =~ s/^-//; # strip off - + if(!$x) {&perr("printftin: bad intermed"); return 0;} + + $x =~ s/^([\d\.]*)//; # strip off dig & . + $y=$1; + if(!$y) {&perr("printftin: bad inches"); return 0;} + if (! 
&printnum($y)) {return 0;} + if($y==1) {&appendo("-inch");} + else {&appendo("-inches");} + + if($back) + { if($back !~ /^[a-zA-Z]/) {&appendo($back);} + else {&pusho($back);} + } + return 1; +} + +sub printint # printint(x) +{ if($vflg) {print "printint: $_[0]\n";} + local($x)=$_[0]; + + local($comma); + local($leading_zero); + local($fractional); + local(@y); + + $fractional=$x =~ /\.\d/; + $x =~ s/^\D*([\d,]*)\D*.*$/$1/; # int part of string + $leading_zero=$x =~ /^0/; + $comma=$x =~ /,/; + $x =~ s/,//g; + if($x eq "") {return;} + + if($x == 0) + { &pusho("zero"); + $leadingzeroflg=1; + return; + } + + @y=(); + for($j=0;$x;$j++) { $y[$j]=chop($x); } + + if($comma || $fractional || 1) + { for($j=3*int($#y/3);$j>=0;$j-=3) + { if($y[$j+2]) { &pusho("$ones_z[$y[$j+2]] hundred");} + if($y[$j+1]==1) { &pusho($teen[$y[$j]]);} + else + { if($y[$j+1]>1) + { &pusho($ten[$y[$j+1]]); + if($y[$j]) + { &appendo("-"); # twenty-five + $appendflg=1; + } + } + if($y[$j]>0) { &pusho($ones_z[$y[$j]]);} + } + if(int($j/3)>0) + { if(int($j/3) > $#mult) + { &perr("printint: too big"); return 0;} + &pusho($mult[int($j/3)]); + } + $commanextflg=1; + } + } + $commanextflg=0; + return 1; +} + +sub printdecfrac +{ if($vflg) {print "printdecfrac: $_[0]\n";} + local($x)=@_[0]; + + if($x !~ /\.\d/) {return;} + $x =~ s/^[^\.]*\.([\d]*)\D?.*$/$1/; # get fractional part + + &pusho($POINT); + @y=split(//,$x); + if($leadingzeroflg) + {for($j=0;$j<=$#y;$j++) { &pusho($ones_z[$y[$j]]);}} + else {for($j=0;$j<=$#y;$j++) { &pusho($ones_oh[$y[$j]]);}} + + return 1; +} + +sub pluralize # pluralize(): pluralize last entry on output stack +{ if($vflg) {print "pluralize: $_[0]\n";} + local($x); + + $_=&geto; + if( /st$/ || /nd$/ || /rd$/ || /th$/ || /quarter$/ || /zero$/ || /oh/ || + /one$/ || /two$/ || /three$/ || /four$/ || /five$/ || + /seven$/ || /eight$/ || /nine$/ || + /ten$/ || /eleven$/ || /twelve$/ || /een$/ || + /hundred$/ || /thousand$/ || /illion$/ ) + { &appendo("s"); + } + elsif (/six$/) + { &appendo("es"); + } + elsif (/half$/) + { $x=&popo(); + $x =~ s/f$/ves/; + &pusho($x); + } + elsif (/ty$/) # fifty etc. 
+ { $x=&popo();
+ $x =~ s/y$/ies/;
+ &pusho($x);
+ }
+ else {&perr("pluralize: unknown word: $_"); return 0;}
+
+ return 1;
+}
+
+sub thize # thize(): add th to last entry on output stack
+{ if($vflg) {print "printthize: $_[0]\n";}
+ local($y)=$_[0];
+
+ local($x);
+
+ $_=&geto;
+ if( /four$/ || /six$/ || /seven$/ || /ten$/ ||
+ /eleven$/ || /een$/ || /hundred$/ || /thousand$/ || /illion$/ )
+ { if($y && $y ne "th") {&perr("thize: mismatch: $_ $y\n"); return 0;} # xth
+ &appendo("th");
+ }
+ elsif( /one$/ ) # 1st
+ { if($y && $y ne "st") {&perr("thize: mismatch: $_ $y\n"); return 0;}
+ $x=&popo();
+ $x =~ s/one$/first/;
+ &pusho($x);
+ }
+ elsif( /two$/ ) # 2nd
+ { if($y && $y ne "nd") {&perr("thize: mismatch: $_ $y\n"); return 0;}
+ $x=&popo();
+ $x =~ s/two$/second/;
+ &pusho($x);
+ }
+ elsif( /three$/ ) # 3rd
+ { if($y && $y ne "rd") {&perr("thize: mismatch: $_ $y\n"); return 0;}
+ $x=&popo();
+ $x =~ s/three$/third/;
+ &pusho($x);
+ }
+ elsif( /five$/ || /twelve$/ ) # 5th, 12th
+ { if($y && $y ne "th") {&perr("thize: mismatch: $_ $y\n"); return 0;}
+ $x=&popo();
+ $x =~ s/ve$/fth/;
+ &pusho($x);
+ }
+ elsif(/eight$/)
+ { if($y && $y ne "th") {&perr("thize: mismatch: $_ $y\n"); return 0;} # 8th
+ &appendo("h");
+ }
+ elsif( /nine$/ )
+ { if($y && $y ne "th") {&perr("thize: mismatch: $_ $y\n"); return 0;}
+ $x=&popo();
+ $x =~ s/nine$/ninth/;
+ &pusho($x);
+ }
+ elsif( /ty$/ )
+ { if($y && $y ne "th") {&perr("thize: mismatch: $_ $y\n"); return 0;}
+ $x=&popo();
+ $x =~ s/ty$/tieth/;
+ &pusho($x);
+ }
+ else {&perr("thize: unknown word: $_"); return 0;}
+ return 1;
+}
+
+sub pusho # pusho($x): push output
+{ if($commanextflg) # global: used for commas in printint
+ { $commanextflg=0;
+ &appendo(",");
+ }
+ if($appendflg) # global: used for fronts
+ { $appendflg=0;
+ &appendo(@_[0]);
+ }
+ else {push(@output,@_);}
+}
+
+sub appendo # appendo($x): append to output
+{ $appendflg=0;
+# if($#output < 0) {&pusho("");}
+ if($#output < 0) {&perr("appendo: output empty"); return 0;}
+ $output[$#output] .= @_[0];
+}
+
+sub popo # popo(): pop last output
+{ if($#output < 0) {&perr("popo: output empty"); return 0;}
+ pop(@output);
+}
+
+sub geto # geto(): get last output
+{ if($#output < 0) {&perr("geto: output empty"); return 0;}
+ return $output[$#output];
+}
+
+sub perr
+{ print STDERR "numproc: $_[0]\n";
+ print STDERR "line number=$.: fields=$last, $this, $next\n";
+# exit(1);
+
+ $appendflg=0;
+ $commanextflg=0;
+ &pusho($this);
+# $field++; # graceful error recovery
+}
+
+sub perr2
+{ print STDERR "numproc: $_[0]\n";
+ exit(1);
+}
diff --git a/egs/bn/s5/local/data_prep/csr_hub4_utils/pare-sgml.perl b/egs/bn/s5/local/data_prep/csr_hub4_utils/pare-sgml.perl
new file mode 100755
index 00000000000..6caf474e3af
--- /dev/null
+++ b/egs/bn/s5/local/data_prep/csr_hub4_utils/pare-sgml.perl
@@ -0,0 +1,36 @@
+#!/usr/bin/perl
+
+# $Id: pare-sgml.perl,v 1.3 1996/08/15 02:51:17 robertm Rel $
+# removes extraneous headers and other non-LM fields
+# translates into LM-standard
+# removes comments (enclosed in brackets)
+
+use strict;
+use warnings;
+
+my $intext=0;
+while (<>)
+{
+ if ($intext == 0)
+ {
+ print if (s=^<DOC id\=(\S+)>=<art id\=$1>=); # header -> LM-standard <art>
+ $intext = 1 if (/^<TEXT>/);
+ next;
+ }
+ if (/^<\/TEXT>/)
+ {
+ $intext = 0;
+ next;
+ }
+ next if /^<p>/;
+ next if /^</;
+
+ s/\[+[^\[\]]*\]+//g;
+ if (/[\[\]]/)
+ {
+ warn "pare-sgml: warning - unbalanced comment brackets at $ARGV line $.\n";
+ print STDERR " line=$_";
+ }
+ print;
+}
diff --git a/egs/bn/s5/local/data_prep/csr_hub4_utils/process_filelist.py b/egs/bn/s5/local/data_prep/csr_hub4_utils/process_filelist.py
new file mode 100755 index 00000000000..3c8a50e3fe4 --- /dev/null +++ b/egs/bn/s5/local/data_prep/csr_hub4_utils/process_filelist.py @@ -0,0 +1,164 @@ +#! /usr/bin/env python + +# Copyright 2016 Vimal Manohar +# Apache 2.0. + +"""Prepare CSR-IV 1996 Language model text corpus (LDC98T31).""" + +from __future__ import print_function +import argparse +import gzip +import logging +import os +import re +import subprocess +from bs4 import BeautifulSoup + + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) +handler = logging.StreamHandler() +handler.setLevel(logging.INFO) +formatter = logging.Formatter("%(asctime)s [%(pathname)s:%(lineno)s - " + "%(funcName)s - %(levelname)s ] %(message)s") +handler.setFormatter(formatter) +logger.addHandler(handler) + + +def get_args(): + """Parses command-line arguments.""" + + parser = argparse.ArgumentParser("""Prepare CSR-IV 1996 Language model text + corpus (LDC98T31).""") + parser.add_argument("--verbose", choices=[0,1,2,3], type=int, default=0, + help="Set higher for more verbose logging.") + parser.add_argument("file_list", type=argparse.FileType('r'), + help="""List of compressed source files""") + parser.add_argument("dir", type=str, + help="Output directory to dump processed files to") + + args = parser.parse_args() + + if args.verbose > 2: + logger.setLevel(logging.DEBUG) + handler.setLevel(logging.DEBUG) + + return args + + +def normalize_text(text): + """Normalizes text and returns the normalized version. + The normalization involves converting text to upper case. + """ + text1 = text.strip() + # text2 = text_normalization.remove_punctuations(text1) + text2 = text1.upper() + text2 = re.sub(r" [ ]*", " ", text2) + return text2 + + +def process_file_lines(lines, out_file_handle): + """Processes input lines from a file by removing SGML tags and + writes normalized plain text to output stream.""" + + doc = re.sub(r"", "", ''.join(lines)) + soup = BeautifulSoup(doc, 'lxml') + + num_written = 0 + + for art in soup.html.body.children: + try: + if art.name != "art": + continue + for para in art.find_all('p'): + assert para.name == 'p' + + for x in para.contents: + try: + if x.name is None: + normalized_text = normalize_text(unicode(x)) + if len(normalized_text) == 0: + continue + out_file_handle.write("{0}\n".format( + normalized_text.encode('ascii'))) + num_written += 1 + except Exception: + logger.error("Failed to process content %s in para " + "%s", x, para) + raise + + except Exception: + try: + logger.error("Failed to process article %s", art['id']) + except AttributeError: + logger.error("Failed to process body content %s", art) + raise + if num_written == 0: + raise RuntimeError("0 sentences written.") + + +def run_command(*args, **kwargs): + if type(args[0]) is list: + command = ' '.join(args[0]) + else: + command = args[0] + + logger.debug("Running command '%s'", command) + p = subprocess.Popen(*args, **kwargs) + return p, command + + +def run(args): + """The one that does it all.""" + + for line in args.file_list.readlines(): + try: + file_ = line.strip() + base_name = os.path.basename(file_) + name = os.path.splitext(base_name)[0] + + out_file = gzip.open("{0}/{1}.txt.gz".format(args.dir, name), + 'w') + + logger.info("Running LM pipefile for |%s|...", base_name) + + p = run_command( + "gunzip -c {0} | " + "local/data_prep/csr_hub4_utils/pare-sgml.perl | " + "perl local/data_prep/csr_hub4_utils/bugproc.perl | " + "perl local/data_prep/csr_hub4_utils/numhack.perl | " + "perl local/data_prep/csr_hub4_utils/numproc.perl " + " 
-xlocal/data_prep/csr_hub4_utils/num_excp | "
+ "perl local/data_prep/csr_hub4_utils/abbrproc.perl "
+ " local/data_prep/csr_hub4_utils/abbrlist | "
+ "perl local/data_prep/csr_hub4_utils/puncproc.perl -np"
+ "".format(file_),
+ stdout=subprocess.PIPE, shell=True)
+
+ stdout = p[0].communicate()[0]
+ if p[0].returncode != 0:
+ logger.error(
+ "Command '%s' failed with return status %d",
+ p[1], p[0].returncode)
+ raise RuntimeError
+
+ process_file_lines(stdout, out_file)
+ out_file.close()
+ except Exception:
+ logger.error("Failed processing file %s", file_)
+ raise
+
+
+def main():
+ """The main function"""
+ args = get_args()
+ try:
+ run(args)
+ finally:
+ args.file_list.close()
+
+
+if __name__ == '__main__':
+ main()
diff --git a/egs/bn/s5/local/data_prep/csr_hub4_utils/process_filelist.sh b/egs/bn/s5/local/data_prep/csr_hub4_utils/process_filelist.sh
new file mode 100755
index 00000000000..15249ae9a19
--- /dev/null
+++ b/egs/bn/s5/local/data_prep/csr_hub4_utils/process_filelist.sh
@@ -0,0 +1,30 @@
+#! /bin/bash
+
+set -e
+set -o pipefail
+set -u
+set -x
+
+if [ $# -ne 2 ]; then
+ echo "Usage: $0
<file-list> <dir>
" + exit 1 +fi + +filelist=$1 +dir=$2 + +export PATH=local/data_prep/csr_hub4_utils:$PATH + +for file in `cat $filelist`; do + BASENM=`basename $file` + name="${BASENM%.*}" + + echo "Running LM pipeline for |$BASENM|..." 1>&2 + gunzip -c $file | pare-sgml.perl | \ + bugproc.perl | \ + numhack.perl | \ + numproc.perl -xlocal/data_prep/csr_hub4_utils/num_excp | \ + abbrproc.perl local/data_prep/csr_hub4_utils/abbrlist | \ + puncproc.perl -np | gzip -c > $dir/$name.txt.gz + echo "Done with $BASENM." +done diff --git a/egs/bn/s5/local/data_prep/csr_hub4_utils/progsummary.perl b/egs/bn/s5/local/data_prep/csr_hub4_utils/progsummary.perl new file mode 100755 index 00000000000..891e26d5650 --- /dev/null +++ b/egs/bn/s5/local/data_prep/csr_hub4_utils/progsummary.perl @@ -0,0 +1,44 @@ +#!/usr/bin/perl + +# Program: progsummary.perl +# Written by: dave graff +# Usage: [file.list] +# Purpose: extracts program information from sgml-ized PSM texts + +$degbug = 0; +if ( $ARGV[0] eq "-d" ) { + $debug = 1; + shift; +} + +while (<>) +{ + chop; + open( INP, "<$_" ); + $progdate = $progid = "unknown"; + while () { + if ( /^/ ) { + $_ = ; + print STDERR if ( $debug ); + $netwrk = substr( $_, 0, 3 ); + $rest = substr( $_, 3 ); + if ( $rest =~ /^(20\/20)/ ) { + $progid = $1; + } + elsif ( $rest =~ /^([A-Z a-z\&]+)/ ) { + $progid = $1; + } + } + elsif ( /^/ ) { + $_ = ; + print STDERR "$_===\n" if ( $debug ); + if ( /\d+\\(\d{6})\\\d+/ ) { + $progdate = $1; + } + } + elsif ( /^<\/art>/ ) { + print "$netwrk\t$progdate\t\"$progid\"\n"; + } + } + close INP; +} diff --git a/egs/bn/s5/local/data_prep/csr_hub4_utils/puncproc.perl b/egs/bn/s5/local/data_prep/csr_hub4_utils/puncproc.perl new file mode 100755 index 00000000000..a6e1f19ba56 --- /dev/null +++ b/egs/bn/s5/local/data_prep/csr_hub4_utils/puncproc.perl @@ -0,0 +1,196 @@ +#!/usr/bin/perl + +# $Id: puncproc.perl,v 1.2 1996/08/05 16:12:42 robertm Rel $ +############################################################################### +# This software is being provided to you, the LICENSEE, by the Massachusetts # +# Institute of Technology (M.I.T.) under the following license. By # +# obtaining, using and/or copying this software, you agree that you have # +# read, understood, and will comply with these terms and conditions: # +# # +# Permission to use, copy, modify and distribute, including the right to # +# grant others the right to distribute at any tier, this software and its # +# documentation for any purpose and without fee or royalty is hereby granted, # +# provided that you agree to comply with the following copyright notice and # +# statements, including the disclaimer, and that the same appear on ALL # +# copies of the software and documentation, including modifications that you # +# make for internal use or for distribution: # +# # +# Copyright 1991-4 by the Massachusetts Institute of Technology. All rights # +# reserved. # +# # +# THIS SOFTWARE IS PROVIDED "AS IS", AND M.I.T. MAKES NO REPRESENTATIONS OR # +# WARRANTIES, EXPRESS OR IMPLIED. By way of example, but not limitation, # +# M.I.T. MAKES NO REPRESENTATIONS OR WARRANTIES OF MERCHANTABILITY OR FITNESS # +# FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR # +# DOCUMENTATION WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, # +# TRADEMARKS OR OTHER RIGHTS. # +# # +# The name of the Massachusetts Institute of Technology or M.I.T. may NOT be # +# used in advertising or publicity pertaining to distribution of the # +# software. 
Title to copyright in this software and any associated # +# documentation shall at all times remain with M.I.T., and USER agrees to # +# preserve same. # +############################################################################### + +# punctuation preprocessor for WSJ +# assumes 1 sentence per line +# places spaces around punctuation and translates to IBM-like notation +# +# punctproc -np removes punctuation +# +# NOTE: wsj89 starts single quotes with ` or ' +# + +for($i=0,$j=0;$i<=$#ARGV;$i++) +{ if($ARGV[$i] =~ /^-/) + { if($ARGV[$i] =~ /^-np$/) {$npflg=1;} + else {&perr2("illegal flag: $ARGV[$i]");} + } + else { &perr2("no file args"); } +} +@ARGV=(); + +while(<>) +{ s/^/ /; + s/\n$/ /; + + next if (/<\/?[spa]/); # protect sgml + + # forbidden symbols + if(//) {&perr(">");} # > + if(/\$/) {&perr("\$");} # $ + if(/_/) {&perr("_");} # _ + if(/\d/) {&perr("[0-9]");} # 0-9 + + # protect contractions with _ + s/([a-zA-Z]in')([^a-zA-Z])/$1_ $2/g; # *in' e.g. Dunkin', singin' + # Rock 'n' Roll + s/(\W)['`]([nN])(\W)/$1 _'$2$3/g; # [`'][nN] -> _'[nN] + s/(\W)([nN]')(\W)/$1$2_ $3/g; # [nN]' + s/(\W)('[eE]m)(\W)/$1_$2$3/g; # '[eE]m + s/(\W)[`'"]R\.?['"](\W)/$1 _"R."_ $3/g; # toys "R" us + s/(\W)(Cos.')(\W)/$1$2_ $3/g; # Cos.' (companies') + s/(\W)(de.')(\W)/$1$2_ $3/g; # de' Imelda de' Lambertazzi + s/(\W)(Bros.')(\W)/$1$2_ $3/g; # Bros.' + s/(\W)(o')(\W)/$1$2_ $3/g; # o' Man o' War + s/(\W)(ol')(\W)/$1$2_ $3/g; # ol' old + s/(\W)maitre *d'(\W)/$1maitre_d'_ $2/g; # maitre d' + s/(\W)maitres *d'(\W)/$1maitres_d'_ $2/g; # maitres d' + s/(\W)('neath)(\W)/$1 _$2$3/g; # 'neath beneath + s/(\W)('Wadoo)(\W)/$1 _$2$3/g; + # 'Wadoo 'Wadoo , zim bam , boodleoo , hoodle ahdawam + s/(\W)('cause)(\W)/$1 _$2$3/g; # 'cause because + s/(\W)('burbs)(\W)/$1 _$2$3/g; # 'burbs suburbs + s/(\W)('[nN]uf)(\W)/$1 _$2$3/g; # 'Nuf enough + s/(\W)('til)(\W)/$1 _$2$3/g; # 'til + + + s/([^\w\.\'\`_ -])/ $1 /g; # SP around most punct + # but not .'`\_- + + if(!$npflg) + { s/ty-(one)/ty $1/g; # rm - from twenty-one + s/ty-(first)/ty $1/g; # rm - from twenty-first + s/ty-(two)/ty $1/g; # rm - from twenty-two + s/ty-(second)/ty $1/g; # rm - from twenty-second + s/ty-(three)/ty $1/g; # rm - from twenty-three + s/ty-(third)/ty $1/g; # rm - from twenty-third + s/ty-(four)/ty $1/g; # rm - from twenty-four + s/ty-(five)/ty $1/g; # rm - from twenty-five + s/ty-(six)/ty $1/g; # rm - from twenty-six + s/ty-(seven)/ty $1/g; # rm - from twenty-seven + s/ty-(eight)/ty $1/g; # rm - from twenty-eight + s/ty-(nin)/ty $1/g; # rm - from twenty-nine{th} + } + #s/([^-])-([^-])/$1 - $2/g; # - + #s/([^-])-([^-])/$1 - $2/g; # - + + s/([^\.]) *\. *\. *\. *\. *([^\.])/$1 _..._ . $2/g; # x ... . + s/([^\.]) *\. *\. *\. *([^\.])/$1 _..._ $2/g; # x ... + + s/([^\w'\.][b-zB-HJ-Z]\.)([^\.\w]*)$/$1 .$2/; # eg. S. at end -> S. . + s/(\s[a-z]\.\s[a-z]\.)([^\.\w]*)$/$1 .$2/i; #eg. S. I. at end -> S. I. . + s/(\WMr\.)(\W*)$/$1 . $2/i; # Mr. at end -> Mr. . + s/(\WMrs\.)(\W*)$/$1 . $2/i; # Mrs. at end -> Mrs. . + s/(\WMs\.)(\W*)$/$1 . $2/i; # Ms. at end -> Ms. . + s/(\WMessrs\.)(\W*)$/$1 . $2/i; # Messrs. at end -> Messrs. . + + s/\.([^.\w]*)$/ . $1/; # SP around . at end of sent + + s/([^\w\.])['`]([a-zA-Z]*)'(\W)/$1 ' $2 ' $3/g; # `word' + s/([^\w\.])['`]([a-zA-Z])/$1 ' $2/g; # 'word + s/([^sS])' /$1 ' /g; # non plural-possessives + + s/([^_])`/$1 ` /g; # SP around ` (should not need) + s/`/'/g; # ` -> ' (should not need) + + s/_/ /g; # clear _ + + if(!$npflg) + { s/ , / ,COMMA /g; # map punct to words + s/ \? 
/ ?QUESTION-MARK /g; + s/ : / :COLON /g; + s/ # / #SHARP-SIGN /g; + s/ @ / @AT-SIGN /g; + s/ ' / 'SINGLE-QUOTE /g; + s/ " / "DOUBLE-QUOTE /g; + s/ ; / ;SEMI-COLON /g; + s/ ! / !EXCLAMATION-POINT /g; + s/ & / &ERSAND /g; + s/ \+ / +PLUS /g; + s/ \{ / {LEFT-BRACE /g; + s/ \} / }RIGHT-BRACE /g; + s/ \( / (LEFT-PAREN /g; + s/ \) / )RIGHT-PAREN /g; + s/ \. / .PERIOD /g; + s/ \.{3} / ...ELLIPSIS /g; + s/ -- / --DASH /g; + # s/ - / -HYPHEN /g; + s/ = / =EQUALS /g; + s/ % / %PERCENT /g; + s/ \/ / \/SLASH /g; + s/ ([b-zB-HJ-Z]) / $1. /g; # restore . removed by elipsis err + } + else + { s/ , / /g; # map punct to words + s/ \? / /g; + s/ : / /g; + s/ # / /g; + s/ @ / at /g; + s/ ' / /g; + s/ " / /g; + s/ ; / /g; + s/ ! / /g; + s/ & / and /g; + s/ \+ / plus /g; + s/ \{ / /g; + s/ \} / /g; + s/ \( / /g; + s/ \) / /g; + s/ \. / /g; + s/ \.{3} / /g; + s/ -- / /g; + s/ ?- ?/ /g; + s/ = / equals /g; + s/ % / percent /g; + s/ \/ / slash /g; + s/\.POINT/point/g; + } +} continue { + # this block is executed even if we use "next" + s/ {2,}/ /g; + s/^ //; + s/ $//; + if($_) {print "$_\n";} +} + +sub perr #perr(error,line); +{ print STDERR "punctproc: line no=$.: $_[0]\n"; + print STDERR "line=$_\n"; +} + +sub perr2 +{ print STDERR "num: $_[0]\n"; + exit(1); +} diff --git a/egs/bn/s5/local/data_prep/csr_hub4_utils/sent-init.vocab b/egs/bn/s5/local/data_prep/csr_hub4_utils/sent-init.vocab new file mode 100644 index 00000000000..375f5ddf99b --- /dev/null +++ b/egs/bn/s5/local/data_prep/csr_hub4_utils/sent-init.vocab @@ -0,0 +1,411 @@ +A +ABORTION +ABOUT +ACCORDING +ACCORDINGLY +ACTORS +ADDED +ADDEDON +ADDING +ADDS +ADJUSTED +ADMITTEDLY +ADVISERS +ADVISORY +AFFLICTED +AFTER +AFTERTHOUGHTS +AGAIN +ALL +ALLIANCES +ALLOCATE +ALLTIME +ALMOST +ALONG +ALSO +ALTHOUGH +ALTOGETHER +AMID +AMONG +AN +AND +ANOTHER +ANY +APPROVED +ARCHRIVAL +ARE +ARRIVING +AS +ASIDE +ASKED +ASSUMPTIONS +AT +AUGUST +AVAILABLE +BACKERS +BANKERS +BARRING +BASED +BECAUSE +BEFORE +BEGINNING +BEHIND +BEING +BESIDES +BEYOND +BIG +BOTH +BROADLY +BURNING +BUT +BY +CALLABLE +CAN +CEASES +CHALLENGES +CHANCES +CHANGING +CHARGED +CHARGES +CLASSES +CLEANUPS +CLEARLY +COMMUNIST +COMPETITORS +COMPLEMENTARY +COMPLETION +CONSEQUENTLY +CONSIDER +CONSISTING +CONVERSELY +CONVICTED +COULD +COUNTING +CURFEWS +CURRENT +CURRENTLY +CUSTOMER +DEATH +DECEMBER +DEFENDERS +DESCRIBED +DETAILS +DIVERSITY +DO +DRACONIAN +DRAFTERS +DUMPING +EACH +EARLIER +EDUCATIONAL +EIGHT +EMBARGO +EUROPES +EVEN +EVENTUALLY +EVER +EVERY +EVERYBODYS +EVERYONE +EXAMPLE +EXCEPT +EXCLUDING +EXHAUSTED +EXPECT +EXPECTED +FAR +FARMERS +FATAL +FEW +FIRST +FIXED +FLOATING +FOLKS +FOR +FORMER +FROM +FURTHER +FURTHERMORE +GIVEN +HAVE +HAVING +HE +HEADING +HELPING +HER +HERE +HERES +HES +HIGHER +HIS +HOLDERS +HOLDING +HOW +HOWEVER +I +IF +ILLEGAL +IM +IMPOSED +IMPROVEMENT +IN +INCLUDING +INCREASINGLY +INDEED +INDEPENDENT +INDICTMENTS +INFORMING +INITIAL +INSTEAD +INSURERS +INTENDS +INTERESTINGLY +INTRODUCED +IS +IT +ITS +IVE +JANUARY +JUMPS +JUST +KNOWN +LAST +LATE +LATER +LEGALLY +LESS +LET +LIKE +LIKEWISE +LIMITS +LOCATED +LONGTERM +LONGTIME +LOOKING +LOOKS +LOSING +MADE +MANY +MARITAL +MAY +MAYBE +MEANWHILE +MEETING +MINIMUM +MONTHLY +MORE +MOREOVER +MOST +MOSTLY +MOUNTED +MR +MUCH +MY +NAMED +NATURAL +NATURALLY +NEARLY +NEGOTIATORS +NEITHER +NEVER +NEXT +NINETYDAY +NOBODY +NONE +NONETHELESS +NOR +NOT +NOTABLY +NOTES +NOTHING +NOTING +NOW +NOWADAYS +OBVIOUSLY +OCCUPATIONAL +OCTOBER +OF +OFFERED +OFTEN +ON +ONCE +ONE +ONEYEAR +ONLY +OPERATING +OPINION +OPPOSITION +OR +OTHER +OTHERS +OTHERWISE +OUR 
+OUTSIDE +OVER +PARENTS +PART +PARTICIPATION +PAYMENT +PEOPLE +PESSIMISTS +PLANTS +PLEDGED +PLURALISTIC +POINTING +POLICY +POLITICAL +POSITIVE +POSTPONED +POTENTIAL +PRESENCE +PRESSURED +PREVIOUSLY +PRODUCERS +PROFIT +PROTECTING +PROTECTIONISM +PROVISIONAL +PURELY +PUT +QUICK +QUITE +RATHER +REACHED +READIED +RECENTLY +RECOGNITION +RECOVERIES +REDEMPTION +REFERRING +RELYING +REMAINING +REMOVING +REOFFER +REPRESENTING +REQUEST +RESEARCHERS +RESULTS +REVIEWED +RIOTS +RIVAL +RUMORS +RUSSIAS +SAYS +SCORING +SECRETARIES +SECTION +SEEKING +SELFDEFENSE +SENDS +SEPARATELY +SEPTEMBER +SEVERAL +SEXUAL +SHE +SHELTER +SHES +SHOPKEEPERS +SHORTLY +SHOULD +SIMILARLY +SINCE +SLIGHTLY +SMALL +SMALLER +SO +SOLDIERS +SOME +SOON +SORRY +SOUGHT +STEPPED +STILL +STUDIES +SUBSCRIBERS +SUBSTANTIAL +SUCH +SUPPORT +SURELY +SWEETHEART +TALKS +TAXPAYERS +THAT +THATS +THE +THEIR +THEN +THERE +THEREAFTER +THEREFORE +THERES +THESE +THEY +THEYLL +THEYRE +THIS +THOSE +THOUGH +THREATENED +THROUGH +THROUGHOUT +THURSDAY +THUS +TO +TODAY +TOGETHER +TONIGHT +TOO +TRADITIONALLY +TRANSFERRED +TROTTING +TRUTH +TUMBLES +TWOFIFTHS +TWOTHIRDS +UNDER +UNFORTUNATELY +UNINSURED +UNLESS +UNLIKE +UNTIL +UPON +URGED +USERS +USING +USUALLY +VENTURE +VERSION +VIRTUALLY +WAS +WATCHING +WE +WEDNESDAY +WEIGHED +WELCOME +WELL +WERE +WEVE +WHAT +WHATS +WHEN +WHENEVER +WHETHER +WHICHEVER +WHILE +WHOEVER +WHY +WITH +WONDERS +WORST +WOULD +WRITTEN +YEARS +YES +YESTERDAY +YESTERDAYS +YET +YOU +YOULL +YOUR +YOURE +YOUVE diff --git a/egs/bn/s5/local/data_prep/csr_hub4_utils/sentag.c b/egs/bn/s5/local/data_prep/csr_hub4_utils/sentag.c new file mode 100644 index 00000000000..af70504d1f1 --- /dev/null +++ b/egs/bn/s5/local/data_prep/csr_hub4_utils/sentag.c @@ -0,0 +1,674 @@ +static char rcsid[] = "$Id: sentag.c,v 1.9 1996/08/13 15:57:35 robertm Rel $"; +/************************************************************* + * sentag.c + *------------------------------------------------------------ + * Intended to do the best possible sentence tagging of + * text data from journalistic sources. Input format is + * the typical TIPSTER-style SGML, in which the critical + * tags required are indicated below, and other tags are + * passed through without modifications: + * + * + * ... + * + *
<p>
+ * All text should be prepared with one paragraph on a line, regardless \ + * how long it is (up to 65536 chars). + *
<p>
+ * The sentag program will make changes within the "TEXT" region only. This \ + * is an example. + *
<p>
+ * In addition to putting one whole paragraph on one line, other cleaning up \ + * may be needed so that output sentences are tidy. This might include removing \ + * "datelines", etc. + *
<p>
+ * Note that closing tags are implicit for paragraphs. The same will apply to \ + * sentence tags in the output. + * + * ... + * + * + * Output format is: + * + * + * ... + * + *
<p id=...>
+ * + * All text should be prepared with one paragraph on a line, regardless \ + * how long it is (up to 65536 chars). + *
<p id=...>
+ * + * The sentag program will make changes within the "TEXT" region only. + * + * This is an example. + *
<p id=...>
+ * + * In addition to putting one whole paragraph on one line, other cleaning up \ + * may be needed so that output sentences are tidy. + * + * This might include removing "datelines", etc. + *
<p id=...>
+ * + * Note that closing tags are implicit for paragraphs. + * + * The same will apply to sentence tags in the output. + * + * ... + * + * + * In a nutshell, this program applies unique ID strings to all + * paragraph tags, inserts an initial tag at the start of each + * paragraph, and for each period "." character that marks the end of + * a sentence within a paragraph, it replaces the following space with + * "\n\n". + * + * This program operates as a pipeline filter. + * + * By default, it looks in "./addressforms" for a list of + * sentence-internal abbreviations, and in "./sent-init.vocab" for a + * list of words that would only be capitalized at the beginning of a + * sentence. The arguments "-a abbrevfile" and "-i sent-init.list" + * can override the defaults. + * + * If either "abbrev" or "sent-init" file is not found, the program exits. + * + * A "sent-init.candidate" file is created, containing all the cases + * in which a capitalized word following a period has been _assumed_ + * to be a continuation of an abbreviated proper noun phrase + * (e.g. U.S. Treasury). This "candidate" file (and a histogram of + * its tokens) should be reviewed to look for (classes of) possible + * missed boundaries. Sentence breaks are NOT applied to these cases, + * and a second pass over the same input data should be made if the + * "sent-init" file is updated to include any of these candidates. + * The argument "-t candidate.file" will override the default name. + */ + +#include +#include +#include +#include + +#define BUFSIZE 65536 +#define MAXABRV 2048 +#define MAXIVCB 1024 +#define MAXBRKS 256 +#define IDLEN 64 +#define MAXSENTLEN 4096 + +char *abbrevs[MAXABRV]; /* contains sentence-internal abbrevs */ +char idstr[IDLEN]; +struct si_word { + char *wd; +} si_node, s_init_wd[MAXIVCB]; /* contains non-capitalized words */ + +int n_abbrevs = 0; +int n_mid_abbrevs, n_s_init = 0, pid; + +FILE *tfp; + +/* -------------------------------------------------- + * w_compare() : comparison function for bsearch() + */ +int w_compare( w1, w2 ) + struct si_word *w1, *w2; +{ + return strcmp( w1->wd, w2->wd ); +} + + +main( ac, av ) + int ac; + char **av; +{ + FILE *afp, *ifp; + int c, i, j, inText; + char buf[BUFSIZE], *cp; + extern int optind, opterr; + extern char *optarg; + int w_compare(); + +/* Handle options or defaults + */ + afp = ifp = tfp = NULL; + while (( c = getopt( ac, av, "a:i:t:" )) != -1 ) + switch ( c ) + { + case 'a': + if (( afp = fopen( optarg, "r" )) == NULL ) { + fprintf( stderr, "Unable to open abbrev file %s\n", optarg ); + exit(1); + } + break; + case 'i': + if (( ifp = fopen( optarg, "r" )) == NULL ) { + fprintf( stderr, "Sent-init.vocab file %s not found.\n", optarg ); + exit(1); + } + break; + case 't': + if (( tfp = fopen( optarg, "w" )) == NULL ) { + fprintf( stderr, "Can't create %s -- quitting.\n", optarg ); + exit(1); + } + break; + default: + fprintf( stderr, "Usage: %s [-a abbrevs] [-i sent-init.vocab]\n", av[0] ); + fprintf( stderr, "version: %s\n", rcsid ); + exit(1); + } + +/* Always create a table of uncertain capitalized words + */ + if ( ! tfp && ( tfp = fopen( "sent-init.candidate", "a" )) == NULL ) { + fprintf( stderr, "Can't create/append-to ./sent-init.candidate\n" ); + exit(1); + } + +/* Load typical sentence-initial words (capitalized only when sentence-intial) + * -- input list file must be presorted alphabetically + */ + if ( ! 
ifp && ( ifp = fopen( "sent-init.vocab", "r" )) == NULL ) { + fprintf( stderr, "File ./sent-init.vocab not found.\n" ); + exit(1); + } + while ( n_s_init < MAXIVCB && fgets( buf, BUFSIZE, ifp ) != NULL ) + if ( buf[0] != '#' ) + s_init_wd[ n_s_init++ ].wd = strdup( strtok( buf, "\n" )); + fclose( ifp ); + +/* Load definite within-sentence abbrevs + */ + if ( ! afp && ( afp = fopen( "addressforms", "r" )) == NULL ) { + fprintf( stderr, "Unable to open file ./addressforms\n" ); + exit(1); + } + while ( n_abbrevs < MAXABRV && fgets( buf, BUFSIZE, afp ) != NULL ) + if ( buf[0] != '#' ) + abbrevs[ n_abbrevs++ ] = strdup( strtok( buf, "." )); + fclose( afp ); + n_mid_abbrevs = n_abbrevs; + +/* Add some special abbrevs to the list + */ + abbrevs[ n_abbrevs++ ] = strdup( "Dr" ); + abbrevs[ n_abbrevs++ ] = strdup( "St" ); + +/* Scan and tag text data + */ + inText = 0; + *idstr = 0; + while ( gets( buf )) + { + if (strlen(buf) > BUFSIZE) + { + fprintf( stderr, "input buffer size exceeded!!\n" ); + fprintf( stderr, "last input:\n%s\n", buf ); + exit(-1); + } + if ( !inText ) { + if ( buf[0] == '<' ) + switch ( buf[1] ) + { + case 'D': + if ( !strncmp( buf, "", 6 )) { + if ( ! *idstr ) { + fprintf( stderr, "No DOCID string -- quitting.\n" ); + exit(1); + } + inText = 1; + pid = 0; + } + break; + default: + break; + } + puts( buf ); + } + else { + if ( buf[0] == '<' ) + switch ( buf[1] ) + { + case 'p': + pid++; + printf( "
<p id=%s-%d>
\n", idstr, pid ); + break; + case '/': + if ( !strncmp( buf, "", 7 )) + inText = 0; + puts( buf ); + break; + default: + if (( !strncmp( buf, "", 9 )) + || ( !strncmp( buf, "", 9 ))) + { + puts( buf ); + } + else + { + fprintf( stderr, "Warning: passing odd markup in %s:\n\t%s\n", idstr, buf ); + puts( buf ); + } + } + else { + strcat( buf, " " ); + sentBreak( buf ); + } + } + } + exit(0); +} + + +char *ucs = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; +char *lcs = "abcdefghijklmnopqrstuvwxyz"; +char *crp_abbrv[] = { "CORP", "INC", "CO", "PLC", "LTD", "BHD", "CIE", + "DEPT", "LTDA", "MFG", "SPA" }; +int n_crp_abbrv = 11; +char *time_zone[] = { "EST", "EDT", "PST", "PDT", "CST", "CDT", "MST", "MDT", "GMT" }; +int n_time_zone = 9; + +#define MAXWDLEN 64 +#define DoNextPeriod continue + +sentBreak( buf ) + char *buf; +{ + char *period[MAXBRKS], *start, perchr, nxtwd[MAXWDLEN]; + char *nxtch, *nxtuc, *nxtsp, *prvch, *prvsp, *endwd, *prvwd, *endpg; + char *openbracketp; + int n_per, i, j, k; + + n_per = 0; + nxtuc = start = buf; + endpg = buf + strlen( buf ) -1; + + /* Locate all possible sentence terminations in this paragraph; + * if none, print what we have as a sentence. + */ + openbracketp=0; + for(nxtsp = buf; *nxtsp != NULL ; nxtsp++) + switch (*nxtsp) + { + case '[': + if ( strchr(nxtsp,']') != NULL ) + openbracketp=nxtsp; + break; + case ']': + if (openbracketp && n_per + && period[n_per-1]+4 > openbracketp + && strchr(".!?",*(nxtsp-1))) + period[n_per-1]=nxtsp-1; + openbracketp=0; + break; + case '.': + case '?': + case '!': + if (openbracketp) continue; + period[n_per++] = nxtsp; + if (n_per >= MAXBRKS) + { + fprintf(stderr, + "MAXBRKS exceeded - more than %d `periods' in\n%s\n", + MAXBRKS, buf); + exit(-1); + } + break; + default: + break; + } + + if ( ! n_per ) { + /* if ( endpg - buf > 3 && strchr(( endpg-2 ), ':' ) != NULL ) */ + tagSentence( buf, endpg ); + return; + } + + /* Check each possible sentence break, using a variety of + * heuristics... At each stage, if evidence indicates a + * clear decision, write the tagged sentence if appropriate, + * and continue on to the next candidate. + */ + for ( i=0; i period[i]; endwd-- ) + *(endwd+1) = *endwd; + *(++endwd) = ' '; + for ( j=i+1; j nxtsp + 3 || + ( nxtuc == nxtsp + 3 && *( nxtuc -1 ) == ' ' )) + && (! (( *(nxtsp+1) == '[' ) + && ( strchr( nxtsp, ']') +2 == nxtuc ) + && ( strchr( ".!?", *(nxtuc-3)) == NULL)))) + DoNextPeriod; + + /* If next token after period is a corporate abbrev, this is + * not a break + */ + j = k = 0; + while ( k < MAXWDLEN && nxtuc[j] != ' ' ) { + if ( isalpha( nxtuc[j] )) + nxtwd[k++] = toupper( nxtuc[j] ); + j++; + } + if ( k < MAXWDLEN ) { + nxtwd[k] = 0; + for ( j=0; j %s\n", idstr, pid, start ); + DoNextPeriod; + } + + /* Inspect the token that precedes the period + */ + perchr = *period[i]; + *period[i] = 0; + + if (( prvsp = strrchr( start, ' ' )) != NULL ) + { + + /* This block looks at a pre-break token that is not sentence-initial. + * Make sure we point to the first alphanumeric character, if any + */ + endwd = prvsp +1; + while ( *endwd && !isalnum( *endwd )) + *endwd++; + if ( ! *endwd ) { /* This was probably an ellipsis "..." 
*/ + *period[i] = perchr; + tagSentence( start, nxtsp ); + start = nxtsp + 1; + DoNextPeriod; + } + + /* - if token ends in a bracket or quote, this is a clear sentence break + */ + if ( strchr( "\")}]", *prvch )) + { + *period[i] = perchr; + tagSentence( start, nxtsp ); + start = nxtsp + 1; + DoNextPeriod; + } + + /* - if token does not begin with upper-case, and is not a time designation + * ("a.m" or "p.m") followed by a time-zone name, and is not "vs" or "excl", + * then this is a real break + */ + if ( !isupper( *endwd )) { + if ( strstr( endwd, ".m" )) { + for ( j=0; j %s\n", idstr, pid, start ); + DoNextPeriod; + } + + /* - if it is a single letter, this is almost certainly + * not a real break (it's a first or middle initial) + */ + if ( strlen( endwd ) == 1 ) { + *period[i] = perchr; + DoNextPeriod; + } + + /* At this point, we are looking at a non-initial multi-char token that + * begins with upper-case, is not a clear mid-sentence abbrev, and is + * followed by a capitalized word that is not a corporate abbrev. + * If the "period" character is actually "?" or "!", OR (the token + * contains lower case and, if a corp-abbrev, is not followed by "(") + * then this is almost certainly a real break (if it is a corp-abbrev + * followed by "(", this is most likely not a break) + */ + if ( perchr != '.' ) { + *period[i] = perchr; + tagSentence( start, nxtsp ); + start = nxtsp + 1; + DoNextPeriod; + } + if ( strpbrk( endwd, lcs )) { + for ( j=0; j\n", nxtwd, idstr, pid ); + } + DoNextPeriod; + + } /* prvsp != NULL */ + + else + + { /* prvsp == NULL */ + /* This block looks at a sentence-initial token preceding + * the period; if "period" is acually "?!", or if the token + * looks like any kind of abbreviation, this is not a real break + */ + if ( perchr != '.' ) { + *period[i] = perchr; + tagSentence( start, nxtsp ); + start = nxtsp + 1; + DoNextPeriod; + } + endwd = start; + while ( *endwd && !isalpha( *endwd )) + endwd++; + if ( ! *endwd ) { + *period[i] = perchr; + DoNextPeriod; + } + for ( j=0; j MAXSENTLEN ) + { + fprintf( stderr, "Warning: in %s, ", idstr ); + fprintf(stderr,"sentence length of %d exceeds MAXSENTLEN (%d)\n", + len,MAXSENTLEN); + strncpy(sent,start,75); + sent[75]=0; + fprintf(stderr,"ignoring `sentence' beginning with:\n %s\n", + sent); + return; + } + + si = start; + so = sent; + alpha = 0; + + while ( si < end ) { + alpha |= (! isspace( *si )); + *so++ = *si++; + } + *so = 0; + + if ( ! alpha ) + return; + + printf( "\n%s\n", sent ); +} + +/* +unpicky_tagSentence( start, end ) + char *start, *end; +{ + if ( start >= end ) { + fprintf( stderr, "ignoring bad sentence mark (%x !< %x) in %s\n", + start, end, idstr ); + fprintf( stderr, "`sentence' from start-pointer:\n%s\n", start ); + return; + } + printf("\n"); + while ( start < end ) + putchar(*start++); + putchar('\n'); +} +*/ diff --git a/egs/bn/s5/local/data_prep/csr_hub4_utils/tr-bn-char.fast.perl b/egs/bn/s5/local/data_prep/csr_hub4_utils/tr-bn-char.fast.perl new file mode 100755 index 00000000000..947ee28e2dc --- /dev/null +++ b/egs/bn/s5/local/data_prep/csr_hub4_utils/tr-bn-char.fast.perl @@ -0,0 +1,13 @@ +#!/usr/bin/perl -pi.old-char + +# handles nonprinting characters in Broadcast News material, to the extent +# that they can be handled, and perhaps a bit beyond... 
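+# (Byte-for-byte, this is the same mapping that tr-bn-char.slow.perl documents
+# one character at a time: \xc4 -> "-"; \x82, \x89, \x8a and \xe9 -> "e";
+# \x90 -> "E"; \xa4 -> "n"; \x87 -> "c".)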
+ +tr/\xc4\x82\x90\xa4\x89\x8a\x87\xe9/-eEneece/; + +s=\xae=<<=g; +s=\xaf=>>=g; +s=\xab= 1/2=g; +s=\xac= 1/4=g; +s=\xf8= degrees=g; +s=\xf1= plus or minus =g; diff --git a/egs/bn/s5/local/data_prep/csr_hub4_utils/tr-bn-char.slow.perl b/egs/bn/s5/local/data_prep/csr_hub4_utils/tr-bn-char.slow.perl new file mode 100755 index 00000000000..8dc87917c0c --- /dev/null +++ b/egs/bn/s5/local/data_prep/csr_hub4_utils/tr-bn-char.slow.perl @@ -0,0 +1,46 @@ +#!/usr/bin/perl -p + +# handles nonprinting characters in Broadcast News material, to the extent +# that they can be handled, and perhaps a bit beyond... + +s=\xc4=-=g; +s=\xae=<<=g; +s=\xaf=>>=g; +s=\x82=e=g; # e' (é) in IBMPC +s=\xab= 1/2=g; +# next most frequent, \xfa, appears to have various use as hard-space, +# hard-return, or noise +s=\x90=E=g; # E' (É) in IBMPC +s=\xa4=n=g; # n~ (ñ) in IBMPC +s=\xac= 1/4=g; +# ^G => noise +# ^A => noise +s=\xf8= degrees=g; +# \x1b => noise? +# \x02 => noise? + +# remainder occur 4 or fewer times each -- may be better to do by hand? +s=\x89=e=g; # e: or E: +s=\xf1= plus or minus =g; +# \xc9 = graphics character => ??? +# \x03 => noise? +# \x04 => noise? +s=\x8a=e=g; # e` (è) in IBMPC +s=\x87=c=g; # c, (ç) in IBMPC +s=\xe9=e=g; # e' (é) in ISO!! +# \xad => spanish inverted question mark (¡), appears (with Spanish) twice! +s=\xad==g; + +# remainder occur only once each -- probably best to check by hand +# \xff +# \xdd +# \xbb +# \xa1 +# \x8d +# \x81 +# \x1c +# \x1a +# \x16 +# \x11 +# \x10 +# \x0c diff --git a/egs/bn/s5/local/data_prep/do-lm-csr96 b/egs/bn/s5/local/data_prep/do-lm-csr96 new file mode 100755 index 00000000000..eec6791904f --- /dev/null +++ b/egs/bn/s5/local/data_prep/do-lm-csr96 @@ -0,0 +1,40 @@ +#!/bin/sh +# $Id: do-lm,v 1.3 1996/08/23 22:43:23 robertm Rel $ +Usage() +{ +cat << EOM 1>&2 +Usage: $0 file(s) + Runs LM pipeline on FILES, with output to "lm" subdirectory of cwd. + Expects to find LM conditioning tools in PATH or ./bin. +EOM +} + +# Excludes "fixvp" stage which has the main effect of killing off +# any SGML tagging that contains a space, e.g.
<art id=...>
. + +# BBN used -np switch for puncproc, removing punctuation; this chooses the +# "verbalize" option instead. + +# Includes new "numhack" module to deal with zip codes and phone numbers. + +if [ $# -eq 0 ] || [ $1 = "-h" ]; then + Usage + exit 1 +fi + +PATH=$PATH:./bin ; export PATH + +for file in $* +do + BASENM=`basename $file` + echo "Running LM pipeline for |$BASENM|..." 1>&2 + set -x + perl pare-sgml.perl $file | + perl bugproc.perl | + perl numhack.perl | + perl numproc.perl | + perl abbrproc.perl | + perl puncproc.perl > lm/$BASENM + set +x + echo "Done with $BASENM." +done diff --git a/egs/bn/s5/local/data_prep/prepare_1996_csr_hub4_corpus.sh b/egs/bn/s5/local/data_prep/prepare_1996_csr_hub4_corpus.sh new file mode 100755 index 00000000000..fc20758eec0 --- /dev/null +++ b/egs/bn/s5/local/data_prep/prepare_1996_csr_hub4_corpus.sh @@ -0,0 +1,51 @@ +#! /bin/bash + +# Copyright 2016 Vimal Manohar +# Apache 2.0. + +set -e +set -o pipefail +set -u + +nj=4 +cmd=run.pl +stage=0 + +. path.sh +. utils/parse_options.sh + +if [ $# -ne 2 ]; then + echo "Usage: $0
<source-dir> <dir>
" + echo " e.g.: $0 /export/corpora/LDC/LDC98T31/ data/local/data/csr96_hub4" + exit 1 +fi + +SOURCE_DIR=$1 +dir=$2 + +mkdir -p $dir + +ls $SOURCE_DIR/1996_csr_hub4_model/st_train/*.stZ \ + $SOURCE_DIR/1996_csr_hub4_model/st_test/*.stZ | sort > \ + $dir/filelist + +mkdir -p $dir/split$nj/ + +if [ $stage -le 1 ]; then + eval utils/split_scp.pl $dir/filelist $dir/split$nj/filelist.{`seq -s, $nj`} + $cmd JOB=1:$nj $dir/log/process_text.JOB.log \ + local/data_prep/csr_hub4_utils/process_filelist.py \ + $dir/split$nj/filelist.JOB $dir +fi + +for x in `ls $SOURCE_DIR/1996_csr_hub4_model/st_train/*.stZ`; do + y=`basename $x` + name=${y%.stZ} + echo $dir/${name}.txt.gz +done > $dir/train.filelist + +for x in `ls $SOURCE_DIR/1996_csr_hub4_model/st_test/*.stZ`; do + y=`basename $x` + name=${y%.stZ} + echo $dir/${name}.txt.gz +done > $dir/test.filelist diff --git a/egs/bn/s5/local/data_prep/prepare_1998_hub4_bn_eng_eval.sh b/egs/bn/s5/local/data_prep/prepare_1998_hub4_bn_eng_eval.sh new file mode 100755 index 00000000000..a167c2cfee0 --- /dev/null +++ b/egs/bn/s5/local/data_prep/prepare_1998_hub4_bn_eng_eval.sh @@ -0,0 +1,87 @@ +#! /bin/bash + +# Copyright 2016 Vimal Manohar +# Apache 2.0. + +if [ $# -ne 2 ]; then + echo "Usage: $0 " + echo "local/prepare_1998_hub4_bn_eng_eval.sh /export/corpora/LDC/LDC2000S86/ data/local/data/eval98" + exit 1 +fi + +SOURCE_DIR=$1 +dir=$2 + +mkdir -p $dir + +for uem in $SOURCE_DIR/h4e_evl/h4e_98_{1,2}.uem; do + python -c ' +import sys, os +uem = sys.argv[1] +reco, ext = os.path.splitext(os.path.basename(uem)) +for line in open(uem).readlines(): + line = line.strip() + if len(line) == 0 or line[0:2] == ";;": + continue + parts = line.split() + + assert parts[1] == "1" + start_time = float(parts[2]) + end_time = float(parts[3]) + + utt = "{0}-{1:06d}-{2:06d}".format(reco, int(start_time * 100), + int(end_time * 100)) + print ("{0} {1} {2} {3}".format(utt, reco, start_time, end_time))' $uem +done > $dir/segments + +cat $SOURCE_DIR/h4e_evl/h4e_98_{1,2}.seg | \ + python -c ' +from __future__ import print_function +import sys + +segments_handle = open(sys.argv[1], "w") +utt2spk_handle = open(sys.argv[2], "w") +for line in sys.stdin.readlines(): + line = line.strip() + if len(line) == 0 or line[0:2] == ";;": + continue + parts = line.split() + + reco = parts[0] + assert parts[1] == "1" + spk = parts[2] + start_time = float(parts[3]) + end_time = float(parts[4]) + + utt = "{spk}-{0}-{1:06d}-{2:06d}".format(reco, int(start_time * 100), + int(end_time * 100), spk=spk) + + print ("{0} {1} {2} {3}".format(utt, reco, start_time, end_time), + file=segments_handle) + print ("{0} {1}".format(utt, spk), + file=utt2spk_handle) +segments_handle.close() +utt2spk_handle.close() +' $dir/segments.pem $dir/utt2spk.pem + +export PATH=$PATH:$KALDI_ROOT/tools/sph2pipe_v2.5 +sph2pipe=`which sph2pipe` || { echo "sph2pipe not found in PATH."; exit 1; } +for x in `ls $SOURCE_DIR/h4e_evl/*.sph`; do + y=`basename $x` + z=${y%.sph} + echo "$z $sph2pipe -f wav $x |"; +done > $dir/wav.scp + +awk '{print $1" "$1" 1"}' $dir/wav.scp > $dir/reco2file_and_channel + +cp $SOURCE_DIR/h4e_evl/h4e_98.glm $dir/glm +cp $SOURCE_DIR/h4e_evl/h4e_98.stm $dir/stm + +awk '{print $1" "$2}' $dir/segments > $dir/utt2spk + +utils/fix_data_dir.sh $dir +utils/copy_data_dir.sh $dir ${dir}.pem + +cp $dir/segments.pem ${dir}.pem/segments +cp $dir/utt2spk.pem ${dir}.pem/utt2spk +utils/fix_data_dir.sh ${dir}.pem diff --git a/egs/bn/s5/local/data_prep/prepare_bn_data.py b/egs/bn/s5/local/data_prep/prepare_bn_data.py new file mode 
100755 index 00000000000..b96d0503367 --- /dev/null +++ b/egs/bn/s5/local/data_prep/prepare_bn_data.py @@ -0,0 +1,208 @@ +#! /usr/bin/env python + +from __future__ import print_function +import argparse +import glob +import logging +import os +import re +from bs4 import BeautifulSoup + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) +handler = logging.StreamHandler() +handler.setLevel(logging.INFO) +formatter = logging.Formatter("%(asctime)s [%(pathname)s:%(lineno)s - " + "%(funcName)s - %(levelname)s ] %(message)s") +handler.setFormatter(formatter) +logger.addHandler(handler) + + +def get_args(): + parser = argparse.ArgumentParser("Prepare BN corpus.") + parser.add_argument("--split-at-sync", type=str, + choices=["true", "false"], default="false", + help="If true, creates separate segments split " + "at each sync tag.") + parser.add_argument("audio_source_dir", type=str, + help="Source directory of audio of BN corpus " + "(LDC97S44)") + parser.add_argument("text_source_dir", type=str, + help="Source directory of text of BN corpus " + "(LDC97T22)") + parser.add_argument("dir", type=str, + help="Output directory to write the kaldi files") + + args = parser.parse_args() + + args.split_at_sync = bool(args.split_at_sync == "true") + return args + + +class Segment(object): + """A class to store a segment with start time, end time, recording id, + speaker, and the text. + """ + def __init__(self, reco_id, speaker=None): + self.reco_id = reco_id + self.text = None + self.start_time = -1 + self.end_time = -1 + if speaker is not None: + self.speaker = speaker + else: + self.speaker = reco_id + + def write_segment(self, out_file): + """writes segment in kaldi segments format""" + print("{0} {1} {2} {3}".format(self.utt_id(), self.reco_id, + self.start_time, self.end_time), + file=out_file) + + def write_utt2spk(self, out_file): + """writes speaker information in kaldi utt2spk format""" + print("{0} {1}".format(self.utt_id(), self.speaker), + file=out_file) + + def write_text(self, out_file): + print("{0} {1}".format(self.utt_id(), self.text), + file=out_file) + + def check(self): + """checks if this is a valid segment""" + assert self.end_time > self.start_time + + def utt_id(self): + """returns the utterance id created from the recording id and + the timing information""" + return ("{spkr}-{0}-{1:06d}-{2:06d}".format( + self.reco_id, int(self.start_time * 100), + int(self.end_time * 100), spkr=self.speaker)) + + def duration(self): + """returns the duration of the segment""" + return self.end_time - self.start_time + + +def process_segment_soup(reco_id, soup, split_at_sync=False): + """Processes the input segment soup into a list of objects of class + Segment. + If split_at_sync is False, then only a segment is created for the soup + without consideration to the sync tags. 
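+    For example (hypothetical times): a turn spanning 10.0-20.0s that
+    contains one sync tag at 14.5s yields a single 10.0-20.0s segment when
+    split_at_sync is False, and two segments, 10.0-14.5s and 14.5-20.0s,
+    when split_at_sync is True.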
+ """ + start_time = float(soup['s_time']) + end_time = float(soup['e_time']) + speaker = soup['speaker'] + + segments = [] + + create_new_segment = True + for x in soup.children: + try: + if x.name == "sync": + assert not create_new_segment + if not split_at_sync: + continue + start_time = float(x['time']) + segments[-1].end_time = start_time + create_new_segment = True + elif x.name == "background" or x.name == "comment": + continue + else: + if create_new_segment: + assert split_at_sync or len(segments) == 0 + segment = Segment(reco_id, speaker) + segment.text = x.encode('ascii').strip().replace('\n', ' ') + segment.start_time = start_time + segment.end_time = end_time + if segment.duration() > 0: + segments.append(segment) + create_new_segment = False + else: + segments[-1].text += ( + ' ' + x.encode('ascii').strip().replace('\n', ' ')) + except Exception: + logger.error("Error processing element %s", x) + raise + + return segments + + +def process_transcription(transcription_file, segments_handle, utt2spk_handle, + text_handle, split_at_sync=False): + """Processes transcription file into segments.""" + doc = ''.join(open(transcription_file).readlines()) + tag_matcher = re.compile(r"(<(Sync|Background)[^>]+>)") + doc_modified = tag_matcher.sub(r"\1", doc) + + soup = BeautifulSoup(doc_modified, 'lxml') + + reco_id, ext = os.path.splitext(os.path.basename(transcription_file)) + reco_id = reco_id.strip('_') # remove trailing underscores in the name + + for episode in soup.find_all("episode"): + for section in episode.find_all("section"): + s_time = section['s_time'] + e_time = section['e_time'] + section_type = section['type'] + + logger.debug("Processing section st = %d, end = %d, " + "type = %s", s_time, e_time, section_type) + + for seg in section.find_all("segment"): + try: + segments = process_segment_soup( + reco_id, seg, split_at_sync=split_at_sync) + for s in segments: + if s.duration() == 0: + continue + s.write_segment(segments_handle) + s.write_utt2spk(utt2spk_handle) + s.write_text(text_handle) + except Exception: + logger.error("Failed processing segment %s", seg) + raise + + +def _run(args): + if not os.path.isdir(args.dir): + os.makedirs(args.dir) + + with open(os.path.join(args.dir, "wav.scp"), 'w') as wav_scp_handle: + for file_ in glob.glob("{0}/{1}/*.sph".format(args.audio_source_dir, + "data")): + reco, ext = os.path.splitext(os.path.basename(file_)) + reco = reco.strip('_') + + print("{0} sox {1} -c 1 -r 16000 -t wav - |".format( + reco, file_), file=wav_scp_handle) + + segments_handle = open(os.path.join(args.dir, "segments"), 'w') + utt2spk_handle = open(os.path.join(args.dir, "utt2spk"), 'w') + text_handle = open(os.path.join(args.dir, "text"), 'w') + for dir_ in glob.glob("{0}/{1}/*/".format(args.text_source_dir, + "hub4_eng_train_trans")): + for x in glob.glob("{0}/*.txt".format(dir_)): + try: + process_transcription(x, segments_handle, utt2spk_handle, + text_handle, + split_at_sync=args.split_at_sync) + except Exception: + logger.error("Failed to process file %s", + x) + raise + segments_handle.close() + utt2spk_handle.close() + text_handle.close() + + +def main(): + try: + args = get_args() + _run(args) + except Exception: + raise + + +if __name__ == '__main__': + main() diff --git a/egs/bn/s5/local/data_prep/prepare_na_news_text_corpus.sh b/egs/bn/s5/local/data_prep/prepare_na_news_text_corpus.sh new file mode 100755 index 00000000000..44138e2a228 --- /dev/null +++ b/egs/bn/s5/local/data_prep/prepare_na_news_text_corpus.sh @@ -0,0 +1,51 @@ +#! 
/bin/bash + +# Copyright 2016 Vimal Manohar +# Apache 2.0. + +. cmd.sh +. path.sh + +set -e +set -o pipefail +set -u + +nj=4 +cmd=run.pl + +. utils/parse_options.sh + +if [ $# -ne 2 ]; then + echo "Usage: $0 " + echo " e.g.: $0 /export/corpora/LDC/LDC95T21 data/local/data/na_news" + exit 1 +fi + +SOURCE_DIR=$1 +dir=$2 + +for x in $SOURCE_DIR/*/*/*; do + year=`basename $x` + newspaper=`basename $(dirname $x)` + d=$dir/${newspaper}_${year} + + mkdir -p $d + + list_file=$d/articles.list + ls $x/*.gz > $list_file + + mkdir -p $d/split$nj + + eval utils/split_scp.pl $d/articles.list \ + $d/split$nj/articles.list.{`seq -s, $nj`} + + ( + $cmd JOB=1:$nj $d/log/get_processed_text.JOB.log \ + local/data_prep/process_na_news_text.py $d/split$nj/articles.list.JOB - \| \ + gzip -c '>' $d/corpus.JOB.gz || exit 1 + gunzip -c $d/corpus.*.gz | gzip -c > $d/corpus.gz || exit 1 + rm $d/corpus.*.gz + ) & +done + +wait diff --git a/egs/bn/s5/local/data_prep/process_na_news_text.py b/egs/bn/s5/local/data_prep/process_na_news_text.py new file mode 100755 index 00000000000..10941dd3186 --- /dev/null +++ b/egs/bn/s5/local/data_prep/process_na_news_text.py @@ -0,0 +1,91 @@ +#! /usr/bin/env python + +from __future__ import print_function +from bs4 import BeautifulSoup +import argparse +import gzip +import logging +import sys + +sys.path.insert(0, 'local/lm') +import text_normalization + + +logger = logging.getLogger(__name__) +logger.setLevel(logging.DEBUG) +handler = logging.StreamHandler() +handler.setLevel(logging.DEBUG) +formatter = logging.Formatter("%(asctime)s [%(pathname)s:%(lineno)s - " + "%(funcName)s - %(levelname)s ] %(message)s") +handler.setFormatter(formatter) +logger.addHandler(handler) + + +def get_args(): + parser = argparse.ArgumentParser("Prepare NA News Text corpus (LDC95T21).") + parser.add_argument("file_list", type=argparse.FileType('r'), + help="List of compressed source files for NA News Text. 
" + "e.g: /export/corpora/LDC/LDC95T21/na_news_1/latwp/1994") + parser.add_argument("out_file", type=argparse.FileType('w'), + help="Output file to write to.") + + args = parser.parse_args() + + return args + + +def normalize_text(text): + text1 = text.strip() + text2 = text_normalization.remove_punctuations(text1) + text2 = text2.upper() + return text2 + + +def process_file(file_handle, out_file_handle): + doc = ' '.join(file_handle.readlines()) + soup = BeautifulSoup(doc, 'lxml') + + num_written = 0 + + for doc in soup.html.body.children: + try: + if doc.name != "doc": + continue + for para in doc.find_all('p'): + assert para.name == 'p' + text = ' '.join([unicode(x).strip() for x in para.contents]) + normalized_text = normalize_text(text) + out_file_handle.write("{0}\n".format( + normalized_text.encode('ascii'))) + num_written += 1 + except: + logger.error("Failed to process document %s", doc) + raise + if num_written == 0: + raise RuntimeError("0 sentences written.") + + +def _run(args): + for line in args.file_list.readlines(): + try: + file_ = line.strip() + with gzip.open(file_, 'r') as f: + process_file(f, args.out_file) + except Exception: + logger.error("Failed processing file %s", file_) + raise + + +def main(): + try: + args = get_args() + _run(args) + except Exception: + raise + finally: + args.out_file.close() + args.file_list.close() + + +if __name__ == '__main__': + main() diff --git a/egs/bn/s5/local/dict b/egs/bn/s5/local/dict new file mode 120000 index 00000000000..384304fdf2a --- /dev/null +++ b/egs/bn/s5/local/dict @@ -0,0 +1 @@ +../../../wsj/s5/local/dict/ \ No newline at end of file diff --git a/egs/bn/s5/local/format_data.sh b/egs/bn/s5/local/format_data.sh new file mode 100755 index 00000000000..b7d58f83718 --- /dev/null +++ b/egs/bn/s5/local/format_data.sh @@ -0,0 +1,28 @@ +#! /bin/bash + +# Copyright 2016 Vimal Manohar +# Apache 2.0. + +echo "$0 $@" # Print the command line for logging +. utils/parse_options.sh || exit 1; + +. ./path.sh || exit 1; + +srcdir=data/local/data +tmpdir=data/local/ + +for t in train; do + utils/fix_data_dir.sh $srcdir/$t + utils/copy_data_dir.sh $srcdir/$t data/$t + cat $srcdir/$t/text | \ + local/normalize_transcripts.pl "" "" > \ + data/$t/text + utils/fix_data_dir.sh data/$t +done + +for t in eval98 eval98.pem; do + utils/copy_data_dir.sh $srcdir/$t data/$t + utils/fix_data_dir.sh data/$t +done + + diff --git a/egs/bn/s5/local/format_lms.sh b/egs/bn/s5/local/format_lms.sh new file mode 100755 index 00000000000..7d9e3b82bfb --- /dev/null +++ b/egs/bn/s5/local/format_lms.sh @@ -0,0 +1,47 @@ +#!/bin/bash +# +# Copyright 2014 Nickolay V. Shmyrev +# Apache 2.0 + +if [ -f path.sh ]; then . path.sh; fi + +set -e -o pipefail -u + +lang_suffix=_test + +. utils/parse_options.sh + +#arpa_lm=data/local/local_lm/data/arpa/4gram.arpa.gz +small_arpa_lm=data/local/local_lm/data/arpa/4gram_small.arpa.gz +big_arpa_lm=data/local/local_lm/data/arpa/4gram_big.arpa.gz + +for f in $small_arpa_lm $big_arpa_lm data/lang_nosp/words.txt; do + [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1 +done + + +set -e + +cp -rT data/lang_nosp/ data/lang_nosp${lang_suffix} + +if [ -f data/lang_nosp${lang_suffix}/G.fst ] && [ data/lang_nosp${lang_suffix}/G.fst -nt $small_arpa_lm ]; then + echo "$0: not regenerating data/lang_nosp${lang_suffix}/G.fst as it already exists and " + echo ".. is newer than the source LM." 
+else + arpa2fst --disambig-symbol=#0 --read-symbol-table=data/lang_nosp/words.txt \ + "gunzip -c $small_arpa_lm|" data/lang_nosp${lang_suffix}/G.fst + echo "$0: Checking how stochastic G is (the first of these numbers should be small):" + fstisstochastic data/lang_nosp${lang_suffix}/G.fst || true + utils/validate_lang.pl --skip-determinization-check data/lang_nosp${lang_suffix} +fi + + +if [ -f data/lang_nosp${lang_suffix}_rescore/G.carpa ] && [ data/lang_nosp${lang_suffix}_rescore/G.carpa -nt $big_arpa_lm ] && \ + [ data/lang_nosp${lang_suffix}_rescore/G.carpa -nt data/lang_nosp/words.txt ]; then + echo "$0: not regenerating data/lang_nosp_rescore/ as it seems to already by up to date." +else + utils/build_const_arpa_lm.sh $big_arpa_lm data/lang_nosp \ + data/lang_nosp${lang_suffix}_rescore || exit 1; +fi + +exit 0; diff --git a/egs/bn/s5/local/lm/merge_word_counts.py b/egs/bn/s5/local/lm/merge_word_counts.py new file mode 100755 index 00000000000..6338cbbf875 --- /dev/null +++ b/egs/bn/s5/local/lm/merge_word_counts.py @@ -0,0 +1,30 @@ +#! /usr/bin/env python + +# Copyright 2016 Vimal Manohar +# Apache 2.0. + +"""This script merges pocolm word_counts and write a new word_counts file. +A min-count argument is required to only write counts that are above the +specified minimum count. +""" + +import sys + + +def main(): + if len(sys.argv) != 2: + sys.stderr.write("Usage: {0} \n".format(sys.argv[0])) + raise SystemExit(1) + + words = {} + for line in sys.stdin.readlines(): + parts = line.strip().split() + words[parts[1]] = words.get(parts[1], 0) + int(parts[0]) + + for word, count in words.iteritems(): + if count >= int(sys.argv[1]): + print ("{0} {1}".format(count, word)) + + +if __name__ == '__main__': + main() diff --git a/egs/bn/s5/local/lm/text_normalization.py b/egs/bn/s5/local/lm/text_normalization.py new file mode 100644 index 00000000000..f74da60a6ef --- /dev/null +++ b/egs/bn/s5/local/lm/text_normalization.py @@ -0,0 +1,42 @@ + +# Copyright 2016 Vimal Manohar +# Apache 2.0. + +"""This module contains methods for doing text normalization of broadcast news +and similar text corpora. +""" + +import re + + +def normalize_bn_transcript(text, noise_word, spoken_noise_word): + """Normalize broadcast news transcript for audio.""" + text.upper() + # Remove unclear speech markings + text = re.sub(r"\(\(([^)]*)\)\)", r"\1", text) + text = re.sub(r"#", "", text) # Remove overlapped speech markings + # Remove invented word markings + text = re.sub(r"\*\*([^*]+)\*\*", r"\1", text) + text = re.sub(r"\[[^]]+\]", noise_word, text) + text = re.sub(r"\{[^}]+\}", spoken_noise_word, text) + text = re.sub(r"\+([^+]+)\+", r"\1", text) + + text1 = [] + for word in text.split(): + # Remove mispronunciation brackets + word = re.sub(r"^@(\w+)$", r"\1", word) + text1.append(word) + return " ".join(text1) + + +def remove_punctuations(text): + """Remove punctuations and some other processing for text sentence.""" + text1 = re.sub("\n", " ", text) + text1 = re.sub(r"(&[^;]+;|--)", " ", text1) + text1 = re.sub(r"''|``|\(|\)", " ", text1) + text1 = re.sub("[^A-Za-z0-9.' _-]", "", text1) + text1 = re.sub(r"\. 
", " ", text1) + text1 = re.sub(r"([^0-9$-])\.([^0-9]|$)", r"\1\2", text1) + text1 = re.sub(r" - ", " ", text1) + text1 = re.sub(r"[ ]+", " ", text1) + return text1 diff --git a/egs/bn/s5/local/normalize_transcripts.pl b/egs/bn/s5/local/normalize_transcripts.pl new file mode 100755 index 00000000000..cccf75def4a --- /dev/null +++ b/egs/bn/s5/local/normalize_transcripts.pl @@ -0,0 +1,47 @@ +#!/usr/bin/env perl +# Copyright 2010-2011 Microsoft Corporation + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + + +# This takes data from the standard input that's unnormalized transcripts in the format +# 4k2c0308 Of course there isn\'t any guarantee the company will keep its hot hand [misc_noise] +# 4k2c030a [loud_breath] And new hardware such as the set of personal computers I\. B\. M\. introduced last week can lead to unexpected changes in the software business [door_slam] +# and outputs normalized transcripts. +# c.f. /mnt/matylda2/data/WSJ0/11-10.1/wsj0/transcrp/doc/dot_spec.doc + +@ARGV == 2 || die "usage: normalize_transcript.pl noise_word < transcript > transcript2"; +$noise_word = shift @ARGV; +$spoken_noise_word = shift @ARGV; + +while() { + $_ =~ m:^(\S+) (.+): || die "bad line $_"; + $utt = $1; + $trans = $2; + print "$utt"; + + $trans =~ tr:a-z:A-Z:; + $trans =~ s:\(\(([^)]*)\)\):$1 :g; # Remove unclear speech markings + $trans =~ s:#: :g; # Remove overlapped speech markings + $trans =~ s:\*\*([^*]+)\*\*:$1 :g; # Remove invented word markings + $trans =~ s:\[[^]]+\]:$noise_word :g; + $trans =~ s:\{[^}]+\}:$spoken_noise_word :g; + foreach $w (split (" ",$trans)) { + $w =~ s:^[+](.+)[+]$:$1:; # Remove mispronunciation brackets + $w =~ s:^@(.*)$:$1:; # Remove best guesses for proper nouns + print " $w"; + } + print "\n"; +} + diff --git a/egs/bn/s5/local/prepare_dict.sh b/egs/bn/s5/local/prepare_dict.sh new file mode 100755 index 00000000000..441849329e1 --- /dev/null +++ b/egs/bn/s5/local/prepare_dict.sh @@ -0,0 +1,191 @@ +#!/bin/bash + +# Copyright 2010-2012 Microsoft Corporation +# 2012-2014 Johns Hopkins University (Author: Daniel Povey) +# 2015 Guoguo Chen +# 2016 Vimal Manohar + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + +# Call this script from one level above, e.g. from the s3/ directory. It puts +# its output in data/local/. 
+ +# The parts of the output of this that will be needed are +# [in data/local/dict/ ] +# lexicon.txt +# extra_questions.txt +# nonsilence_phones.txt +# optional_silence.txt +# silence_phones.txt + +. path.sh +. cmd.sh + +set -e +set -o pipefail +set -u + +# run this from ../ +dict_suffix= + +echo "$0 $@" # Print the command line for logging +. utils/parse_options.sh || exit 1; + +if [ $# -ne 1 ]; then + echo "Usage: $0 " + echo "e.g. : $0 data/local/local_lm/data/work/wordlist" + exit 1 +fi + +wordlist=$1 + +dir=data/local/dict${dict_suffix} +mkdir -p $dir + +if [ ! -d $dir/cmudict ]; then + # (1) Get the CMU dictionary + svn co https://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict \ + $dir/cmudict || exit 1; +fi + +# can add -r 10966 for strict compatibility. + + +#(2) Dictionary preparation: + + +# Make phones symbol-table (adding in silence and verbal and non-verbal noises at this point). +# We are adding suffixes _B, _E, _S for beginning, ending, and singleton phones. + +# silence phones, one per line. +(echo SIL; echo SPN; echo NSN) > $dir/silence_phones.txt +echo SIL > $dir/optional_silence.txt + +# nonsilence phones; on each line is a list of phones that correspond +# really to the same base phone. +cat $dir/cmudict/cmudict.0.7a.symbols | perl -ane 's:\r::; print;' | \ + perl -e 'while(<>){ + chop; m:^([^\d]+)(\d*)$: || die "Bad phone $_"; + $phones_of{$1} .= "$_ "; } + foreach $list (values %phones_of) {print $list . "\n"; } ' \ + > $dir/nonsilence_phones.txt || exit 1; + +# A few extra questions that will be added to those obtained by automatically clustering +# the "real" phones. These ask about stress; there's also one for silence. +cat $dir/silence_phones.txt| awk '{printf("%s ", $1);} END{printf "\n";}' > $dir/extra_questions.txt || exit 1; +cat $dir/nonsilence_phones.txt | perl -e 'while(<>){ foreach $p (split(" ", $_)) { + $p =~ m:^([^\d]+)(\d*)$: || die "Bad phone $_"; $q{$2} .= "$p "; } } foreach $l (values %q) {print "$l\n";}' \ + >> $dir/extra_questions.txt || exit 1; + +grep -v ';;;' $dir/cmudict/cmudict.0.7a | \ + perl -ane 'if(!m:^;;;:){ s:(\S+)\(\d+\) :$1 :; print; }' \ + > $dir/dict.cmu || exit 1; + +# Add to cmudict the silences, noises etc. + +(echo '!SIL SIL'; echo ' SPN'; echo ' SPN'; echo ' NSN'; ) | \ + cat - $dir/dict.cmu > $dir/lexicon2_raw.txt +awk '{print $1}' $dir/lexicon2_raw.txt > $dir/orig_wordlist + +cat <$dir/silence_phones.txt +SIL +SPN +NSN +EOF + +if [ ! -f exp/g2p/.done ]; then + steps/dict/train_g2p.sh --cmd "$train_cmd" \ + --silence-phones $dir/silence_phones.txt \ + $dir/dict.cmu exp/g2p + touch exp/g2p/.done +fi + +cat $wordlist | python -c ' +import sys + +words = {} +for line in open(sys.argv[1]).readlines(): + words[line.strip()] = 1 + +oovs = {} +for line in sys.stdin.readlines(): + word = line.strip() + if word not in words: + oovs[word] = 1 + +for oov in oovs: + print (oov)' $dir/orig_wordlist | sort -u > $dir/oovlist + +export PATH=$PATH:`pwd`/local/dict + +cat $dir/oovlist | get_acronym_prons.pl $dir/lexicon2_raw.txt > $dir/dict.acronyms + +mkdir -p $dir/f $dir/b # forward, backward directions of rules... + # forward is normal suffix + # rules, backward is reversed (prefix rules). These + # dirs contain stuff we create while making the rule-based + # extensions to the dictionary. + +# Remove ; and , from words, if they are present; these +# might crash our scripts, as they are used as separators there. 
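+# (Illustrative example of what the rule mining below can learn: CMUdict has
+# "BAKE  B EY1 K" and "BAKED  B EY1 K T", so a suffix rule roughly of the
+# form "spelling +D => pronunciation +T" becomes a candidate; score_prons.pl
+# then vets such rules against the rest of the dictionary before they are
+# applied to the OOV list.)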
+filter_dict.pl $dir/dict.cmu > $dir/f/dict +cat $dir/oovlist | filter_dict.pl > $dir/f/oovs +reverse_dict.pl $dir/f/dict > $dir/b/dict +reverse_dict.pl $dir/f/oovs > $dir/b/oovs + +# The next stage takes a few minutes. +# Note: the forward stage takes longer, as English is +# mostly a suffix-based language, and there are more rules +# that it finds. +for d in $dir/f $dir/b; do + ( + cd $d + cat dict | get_rules.pl 2>get_rules.log >rules + get_rule_hierarchy.pl rules >hierarchy + awk '{print $1}' dict | get_candidate_prons.pl rules dict | \ + limit_candidate_prons.pl hierarchy | \ + score_prons.pl dict | \ + count_rules.pl >rule.counts + # the sort command below is just for convenience of reading. + score_rules.pl rules.with_scores + get_candidate_prons.pl rules.with_scores dict oovs | \ + limit_candidate_prons.pl hierarchy > oovs.candidates + ) & +done +wait + +# Merge the candidates. +reverse_candidates.pl $dir/b/oovs.candidates | cat - $dir/f/oovs.candidates | sort > $dir/oovs.candidates +select_candidate_prons.pl <$dir/oovs.candidates | awk -F';' '{printf("%s %s\n", $1, $2);}' \ + > $dir/dict.oovs + +cat $dir/dict.acronyms $dir/dict.oovs | sort | uniq > $dir/dict.oovs_merged +awk '{print $1}' $dir/dict.oovs_merged | uniq > $dir/oovlist.handled +sort $dir/oovlist | diff - $dir/oovlist.handled | grep -v 'd' | sed 's:< ::' > $dir/oovlist.not_handled + +steps/dict/apply_g2p.sh --cmd "$train_cmd" \ + $dir/oovlist.not_handled exp/g2p exp/g2p/oov_lex +cat exp/g2p/oov_lex/lexicon.lex | cut -f 1,3 | awk '{if (NF > 1) print $0}' > \ + $dir/dict.oovs_g2p + +# the sort | uniq is to remove a duplicated pron from cmudict. +cat $dir/lexicon2_raw.txt $dir/dict.oovs_merged $dir/dict.oovs_g2p | sort | uniq > \ + $dir/lexicon.txt || exit 1; +# lexicon.txt is without the _B, _E, _S, _I markers. + +rm $dir/lexiconp.txt 2>/dev/null || true + +echo "Dictionary preparation succeeded" + + diff --git a/egs/bn/s5/local/run_cleanup_segmentation.sh b/egs/bn/s5/local/run_cleanup_segmentation.sh new file mode 100755 index 00000000000..0927b9f9a7d --- /dev/null +++ b/egs/bn/s5/local/run_cleanup_segmentation.sh @@ -0,0 +1,93 @@ +#!/bin/bash + +# Copyright 2016 Vimal Manohar +# 2016 Johns Hopkins University (author: Daniel Povey) +# Apache 2.0 + +# This script demonstrates how to re-segment training data selecting only the +# "good" audio that matches the transcripts. +# The basic idea is to decode with an existing in-domain acoustic model, and a +# biased language model built from the reference, and then work out the +# segmentation from a ctm like file. + +# For nnet3 and chain results after cleanup, see the scripts in +# local/nnet3/run_tdnn.sh and local/chain/run_tdnn.sh + +# GMM Results for speaker-independent (SI) and speaker adaptive training (SAT) systems on dev and test sets +# [will add these later]. + +set -e +set -o pipefail +set -u + +stage=0 +cleanup_stage=0 +data=data/train +cleanup_affix=cleaned +srcdir=exp/tri3 +nj=100 +decode_nj=16 +decode_num_threads=4 + +. ./path.sh +. ./cmd.sh +. utils/parse_options.sh + +cleaned_data=${data}_${cleanup_affix} + +dir=${srcdir}_${cleanup_affix}_work +cleaned_dir=${srcdir}_${cleanup_affix} + +if [ $stage -le 1 ]; then + # This does the actual data cleanup. 
+ steps/cleanup/clean_and_segment_data.sh --stage $cleanup_stage --nj $nj --cmd "$train_cmd" \ + $data data/lang_nosp $srcdir $dir $cleaned_data +fi + +if [ $stage -le 2 ]; then + steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \ + $cleaned_data data/lang_nosp $srcdir ${srcdir}_ali_${cleanup_affix} +fi + +if [ $stage -le 3 ]; then + steps/train_sat.sh --cmd "$train_cmd" \ + 4200 40000 $cleaned_data data/lang_nosp ${srcdir}_ali_${cleanup_affix} ${cleaned_dir} +fi + +if [ $stage -le 4 ]; then + # Test with the models trained on cleaned-up data. + utils/mkgraph.sh data/lang_nosp_test ${cleaned_dir} ${cleaned_dir}/graph_nosp + + for dset in eval98.pem; do + steps/decode_fmllr.sh --nj $decode_nj --num-threads $decode_num_threads \ + --cmd "$decode_cmd" --num-threads 4 \ + ${cleaned_dir}/graph_nosp data/${dset} ${cleaned_dir}/decode_nosp_${dset} + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang_nosp_test data/lang_nosp_test_rescore \ + data/${dset} ${cleaned_dir}/decode_nosp_${dset} ${cleaned_dir}/decode_nosp_${dset}_rescore + done +fi + +if [ $stage -le 5 ]; then + steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \ + $cleaned_data data/lang_nosp ${cleaned_dir} ${cleaned_dir}_ali_${cleanup_affix} +fi + +if [ $stage -le 6 ]; then + steps/train_sat.sh --cmd "$train_cmd" \ + 5000 100000 $cleaned_data data/lang_nosp \ + ${cleaned_dir}_ali_${cleanup_affix} exp/tri4b_${cleanup_affix} +fi + +cleaned_dir=exp/tri4b_${cleanup_affix} +if [ $stage -le 7 ]; then + # Test with the models trained on cleaned-up data. + utils/mkgraph.sh data/lang_nosp_test ${cleaned_dir} ${cleaned_dir}/graph_nosp + + for dset in eval98.pem; do + steps/decode_fmllr.sh --nj $decode_nj --num-threads $decode_num_threads \ + --cmd "$decode_cmd" --num-threads 4 \ + ${cleaned_dir}/graph_nosp data/${dset} ${cleaned_dir}/decode_nosp_${dset} + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang_nosp_test data/lang_nosp_test_rescore \ + data/${dset} ${cleaned_dir}/decode_nosp_${dset} ${cleaned_dir}/decode_nosp_${dset}_rescore + done +fi diff --git a/egs/bn/s5/local/score.sh b/egs/bn/s5/local/score.sh new file mode 120000 index 00000000000..d89286dc25a --- /dev/null +++ b/egs/bn/s5/local/score.sh @@ -0,0 +1 @@ +score_sclite.sh \ No newline at end of file diff --git a/egs/bn/s5/local/score_sclite.sh b/egs/bn/s5/local/score_sclite.sh new file mode 100755 index 00000000000..20045c2e96b --- /dev/null +++ b/egs/bn/s5/local/score_sclite.sh @@ -0,0 +1,94 @@ +#!/bin/bash +# Copyright Johns Hopkins University (Author: Daniel Povey) 2012. Apache 2.0. + +# begin configuration section. +cmd=run.pl +stage=0 +min_lmwt=5 +max_lmwt=17 +iter=final +word_ins_penalty=0.0,0.5,1.0 +#end configuration section. + +[ -f ./path.sh ] && . ./path.sh +. parse_options.sh || exit 1; + +if [ $# -ne 3 ]; then + echo "Usage: local/score_sclite.sh [--cmd (run.pl|queue.pl...)] " + echo " Options:" + echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." + echo " --stage (0|1|2) # start scoring script from part-way through." + echo " --min_lmwt # minumum LM-weight for lattice rescoring " + echo " --max_lmwt # maximum LM-weight for lattice rescoring " + exit 1; +fi + +data=$1 +lang=$2 # Note: may be graph directory not lang directory, but has the necessary stuff copied. +dir=$3 + +model=$dir/../$iter.mdl # assume model one level up from decoding dir. + +hubscr=$KALDI_ROOT/tools/sctk/bin/hubscr.pl +[ ! 
diff --git a/egs/bn/s5/local/score.sh b/egs/bn/s5/local/score.sh
new file mode 120000
index 00000000000..d89286dc25a
--- /dev/null
+++ b/egs/bn/s5/local/score.sh
@@ -0,0 +1 @@
+score_sclite.sh
\ No newline at end of file
diff --git a/egs/bn/s5/local/score_sclite.sh b/egs/bn/s5/local/score_sclite.sh
new file mode 100755
index 00000000000..20045c2e96b
--- /dev/null
+++ b/egs/bn/s5/local/score_sclite.sh
@@ -0,0 +1,94 @@
+#!/bin/bash
+# Copyright Johns Hopkins University (Author: Daniel Povey) 2012.  Apache 2.0.

+# begin configuration section.
+cmd=run.pl
+stage=0
+min_lmwt=5
+max_lmwt=17
+iter=final
+word_ins_penalty=0.0,0.5,1.0
+#end configuration section.

+[ -f ./path.sh ] && . ./path.sh
+. parse_options.sh || exit 1;

+if [ $# -ne 3 ]; then
+  echo "Usage: local/score_sclite.sh [--cmd (run.pl|queue.pl...)] <data-dir> <lang-dir|graph-dir> <decode-dir>"
+  echo " Options:"
+  echo "    --cmd (run.pl|queue.pl...)      # specify how to run the sub-processes."
+  echo "    --stage (0|1|2)                 # start scoring script from part-way through."
+  echo "    --min_lmwt <int>                # minimum LM-weight for lattice rescoring "
+  echo "    --max_lmwt <int>                # maximum LM-weight for lattice rescoring "
+  exit 1;
+fi

+data=$1
+lang=$2 # Note: may be graph directory not lang directory, but has the necessary stuff copied.
+dir=$3

+model=$dir/../$iter.mdl # assume model one level up from decoding dir.

+hubscr=$KALDI_ROOT/tools/sctk/bin/hubscr.pl
+[ ! -f $hubscr ] && echo "Cannot find scoring program at $hubscr" && exit 1;
+hubdir=`dirname $hubscr`

+for f in $data/stm $data/glm $lang/words.txt $lang/phones/word_boundary.int \
+     $model $data/segments $data/reco2file_and_channel $dir/lat.1.gz; do
+  [ ! -f $f ] && echo "$0: expecting file $f to exist" && exit 1;
+done

+if [ -f $dir/../frame_shift ]; then
+  frame_shift_opt="--frame-shift=$(cat $dir/../frame_shift)"
+  echo "$0: $dir/../frame_shift exists, using $frame_shift_opt"
+elif [ -f $dir/../frame_subsampling_factor ]; then
+  factor=$(cat $dir/../frame_subsampling_factor) || exit 1
+  frame_shift_opt="--frame-shift=0.0$factor"
+  echo "$0: $dir/../frame_subsampling_factor exists, using $frame_shift_opt"
+fi

+name=`basename $data`; # e.g. eval2000

+mkdir -p $dir/scoring/log

+if [ $stage -le 0 ]; then
+  for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do
+    $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/get_ctm.LMWT.${wip}.log \
+      mkdir -p $dir/score_LMWT_${wip}/ '&&' \
+      lattice-scale --lm-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \
+      lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \
+      lattice-1best ark:- ark:- \| \
+      lattice-align-words $lang/phones/word_boundary.int $model ark:- ark:- \| \
+      nbest-to-ctm $frame_shift_opt ark:- - \| \
+      utils/int2sym.pl -f 5 $lang/words.txt \| \
+      utils/convert_ctm.pl $data/segments $data/reco2file_and_channel \
+      '>' $dir/score_LMWT_${wip}/$name.ctm || exit 1;
+  done
+fi

+if [ $stage -le 1 ]; then
+  # Remove some stuff we don't want to score, from the ctm.
+  # the big expression in parentheses contains all the things that get mapped
+  # by the glm file, into hesitations.
+  # The -$ expression removes partial words.
+  # the aim here is to remove all the things that appear in the reference as optionally
+  # deletable (inside parentheses), as if we delete these there is no loss, while
+  # if we get them correct there is no gain.
+  for x in $dir/score_*/$name.ctm; do
+    cp $x $dir/tmpf;
+    cat $dir/tmpf | grep -i -v -E '<UNK>' | \
+      grep -i -v -E ' (UH|UM|EH|MM|HM|AH|HUH|HA|ER|OOF|HEE|ACH|EEE|EW)$' | \
+      grep -v -- '-$' > $x;
+  done
+fi
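+
+# For example, with the filters above a ctm line like "rec1 1 12.34 0.28 UM",
+# or one ending in a partial word such as "rec1 1 13.00 0.30 INTER-", would
+# be dropped before scoring (made-up lines, just to illustrate the greps).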
+
+# Score the set...
+if [ $stage -le 2 ]; then
+  for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do
+    $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.${wip}.log \
+      cp $data/stm $dir/score_LMWT_${wip}/ '&&' \
+      $hubscr -p $hubdir -V -l english -h hub4 -g $data/glm -r $dir/score_LMWT_${wip}/stm $dir/score_LMWT_${wip}/${name}.ctm || exit 1;
+  done
+fi

+exit 0
diff --git a/egs/bn/s5/local/train_lm.sh b/egs/bn/s5/local/train_lm.sh
new file mode 100755
index 00000000000..d8523ca30f4
--- /dev/null
+++ b/egs/bn/s5/local/train_lm.sh
@@ -0,0 +1,149 @@
+#!/bin/bash

+# Copyright 2016  Vimal Manohar
+# Apache 2.0
+#
+# This script trains an LM on the Broadcast News transcripts.
+# It is based on the example scripts distributed with PocoLM.

+# It first checks that pocolm is installed; if it is not, it prints
+# installation instructions and exits.

+set -e
+set -o pipefail
+set -u

+stage=0

+echo "$0 $@"  # Print the command line for logging
+. utils/parse_options.sh || exit 1;

+dir=data/local/local_lm
+lm_dir=${dir}/data

+mkdir -p $dir
+. ./path.sh || exit 1; # for KALDI_ROOT
+export PATH=$KALDI_ROOT/tools/pocolm/scripts:$PATH
+( # First make sure the pocolm toolkit is installed.
+  cd $KALDI_ROOT/tools || exit 1;
+  if [ -d pocolm ]; then
+    echo Not installing the pocolm toolkit since it is already there.
+  else
+    echo "$0: Please install the PocoLM toolkit with: "
+    echo " cd ../../../tools; extras/install_pocolm.sh; cd -"
+    exit 1;
+  fi
+) || exit 1;

+num_dev_sentences=5000
+RANDOM=0

+if [ $stage -le 0 ]; then
+  mkdir -p ${dir}/data
+  mkdir -p ${dir}/data/text

+  echo "$0: Getting the data sources"

+  rm ${dir}/data/text/* 2>/dev/null || true

+  cat data/train/text | shuf > ${dir}/train_text
+  head -n $num_dev_sentences < ${dir}/train_text | cut -d ' ' -f 2- > ${dir}/data/text/dev.txt
+  tail -n +$[num_dev_sentences+1] < ${dir}/train_text | cut -d ' ' -f 2- > ${dir}/data/text/bn.txt

+  for x in data/local/data/na_news/*; do
+    y=`basename $x`
+    [ -f $x/corpus.gz ] && ln -sf `readlink -f $x/corpus.gz` ${dir}/data/text/${y}.txt.gz
+  done

+  # For reporting perplexities we'll use the "real" dev set (a subset of the
+  # training data is already held out as ${dir}/data/text/dev.txt to work
+  # out interpolation weights).
+  # Note: we can't put it in ${dir}/data/text/, because then pocolm would use
+  # it as one of the data sources.
+  cat data/eval98/stm | awk '!/^;;/ {if (NF > 6) print $0}' | cut -d ' ' -f 1,7- | \
+    local/normalize_transcripts.pl "<NOISE>" "<SPOKEN_NOISE>" | \
+    cut -d ' ' -f 2- > ${dir}/data/real_dev_set.txt
+fi

+if [ $stage -le 1 ]; then
+  mkdir -p $dir/data/work
+  if [ ! -f $dir/data/work/word_counts/.done ]; then
+    get_word_counts.py $dir/data/text $dir/data/work/word_counts
+    touch $dir/data/work/word_counts/.done
+  fi
+fi
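+
+# A sketch of what the merging in the next stage consumes and produces,
+# assuming "<count> <word>" lines (the perl command below also relies on
+# the word being the second field); merge_word_counts.py is given a
+# min-count below which merged entries are discarded:
+#   $ cat a.counts b.counts
+#   12 the
+#   9 the
+#   3 zymurgy
+#   $ cat a.counts b.counts | local/lm/merge_word_counts.py 15
+#   21 the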
+
+if [ $stage -le 2 ]; then
+  for x in data/local/data/na_news/*; do
+    y=$dir/data/work/word_counts/`basename $x`.counts
+    [ -f $y ] && cat $y
+  done | local/lm/merge_word_counts.py 15 > $dir/data/work/na_news.wordlist_counts

+  cat $dir/data/work/word_counts/{bn,dev}.counts | \
+    local/lm/merge_word_counts.py 2 > $dir/data/work/bn.wordlist_counts

+  cat $dir/data/work/na_news.wordlist_counts $dir/data/work/bn.wordlist_counts | \
+    perl -ane 'if ($F[1] =~ m/[A-Za-z]/) { print "$F[1]\n"; }' | \
+    sort -u > $dir/data/work/wordlist
+fi

+order=4
+wordlist=$dir/data/work/wordlist

+min_counts='default=5 bn=1'

+lm_name="`basename ${wordlist}`_${order}"
+if [ -n "${min_counts}" ]; then
+  lm_name+="_`echo ${min_counts} | tr -s "[:blank:]" "_" | tr "=" "-"`"
+fi
+unpruned_lm_dir=${lm_dir}/${lm_name}.pocolm

+if [ $stage -le 3 ]; then
+  # We pass in the vocabulary (decided on above) via --wordlist; see the
+  # train_lm.py options if you need to restrict the max memory for 'sort'.
+  echo "$0: training the unpruned LM"
+  train_lm.py --wordlist=$wordlist --num-splits=10 --warm-start-ratio=20 \
+    --limit-unk-history=true \
+    --fold-dev-into=bn \
+    --min-counts="${min_counts}" \
+    ${dir}/data/text ${order} ${lm_dir}/work ${unpruned_lm_dir}

+  get_data_prob.py ${dir}/data/real_dev_set.txt ${unpruned_lm_dir} 2>&1 | grep -F '[perplexity'
+  #[perplexity = 157.87] over 18290.0 words

+  mkdir -p ${dir}/data/arpa
+  format_arpa_lm.py ${unpruned_lm_dir} | gzip -c > ${dir}/data/arpa/${order}gram.arpa.gz
+fi

+if [ $stage -le 4 ]; then
+  echo "$0: pruning the LM (to larger size)"
+  # Using 10 million n-grams for a big LM for rescoring purposes.
+  size=10000000
+  prune_lm_dir.py --target-num-ngrams=$size --initial-threshold=0.02 ${unpruned_lm_dir} ${dir}/data/lm_${order}_prune_big

+  get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_big 2>&1 | grep -F '[perplexity'

+  # current results, after adding --limit-unk-history=true:
+  # get_data_prob.py: log-prob of data/local/local_lm/data/real_dev_set.txt given model data/local/local_lm/data/lm_4_prune_big was -5.16562818753 per word [perplexity = 175.147449465] over 18290.0 words.

+  mkdir -p ${dir}/data/arpa
+  format_arpa_lm.py ${dir}/data/lm_${order}_prune_big | gzip -c > ${dir}/data/arpa/${order}gram_big.arpa.gz
+fi

+if [ $stage -le 5 ]; then
+  echo "$0: pruning the LM (to smaller size)"
+  # Using 2 million n-grams for a smaller LM for graph building. Prune from the
+  # bigger-pruned LM; it'll be faster.
+  size=2000000
+  prune_lm_dir.py --target-num-ngrams=$size ${dir}/data/lm_${order}_prune_big ${dir}/data/lm_${order}_prune_small

+  get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_small 2>&1 | grep -F '[perplexity'

+  # current results, after adding --limit-unk-history=true (needed for modeling OOVs and not blowing up LG.fst):
+  # get_data_prob.py: log-prob of data/local/local_lm/data/real_dev_set.txt given model data/local/local_lm/data/lm_4_prune_small was -5.29432352378 per word [perplexity = 199.202824404] over 18290.0 words.

+  format_arpa_lm.py ${dir}/data/lm_${order}_prune_small | gzip -c > ${dir}/data/arpa/${order}gram_small.arpa.gz
+fi
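+
+# The pruned LMs written above are the ones local/format_lms.sh expects:
+#   ${dir}/data/arpa/${order}gram_small.arpa.gz  (LM for graph building)
+#   ${dir}/data/arpa/${order}gram_big.arpa.gz    (const-arpa rescoring LM)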
+
diff --git a/egs/bn/s5/path.sh b/egs/bn/s5/path.sh
new file mode 100755
index 00000000000..da29adb7b2a
--- /dev/null
+++ b/egs/bn/s5/path.sh
@@ -0,0 +1,6 @@
+export KALDI_ROOT=`pwd`/../../..
+export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/sph2pipe_v2.5:$PWD:$PATH
+[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1
+. $KALDI_ROOT/tools/config/common_path.sh
+. $KALDI_ROOT/tools/env.sh
+export LC_ALL=C
diff --git a/egs/bn/s5/run.sh b/egs/bn/s5/run.sh
new file mode 100755
index 00000000000..24c47cb90ba
--- /dev/null
+++ b/egs/bn/s5/run.sh
@@ -0,0 +1,90 @@
+#!/bin/bash

+# Copyright 2016  Vimal Manohar
+# Apache 2.0.

+# See README.txt for more info on data required.

+. cmd.sh
+. path.sh

+set -o pipefail

+mfccdir=`pwd`/mfcc
+nj=40

+local/data_prep/prepare_bn_data.py --split-at-sync=false \
+  /export/corpora5/LDC/LDC97S44 \
+  /export/corpora/LDC/LDC97T22 data/local/data/train

+local/data_prep/prepare_na_news_text_corpus.sh --nj 40 --cmd "$train_cmd" \
+  /export/corpora/LDC/LDC95T21 data/local/data/na_news

+local/data_prep/prepare_1996_csr_hub4_corpus.sh --nj 10 --cmd "$train_cmd" \
+  /export/corpora/LDC/LDC98T31 data/local/data/csr96_hub4

+local/data_prep/prepare_1998_hub4_bn_eng_eval.sh /export/corpora/LDC/LDC2000S86/ \
+  data/local/data/eval98

+local/format_data.sh

+local/train_lm.sh

+local/prepare_dict.sh --dict-suffix "_nosp" \
+  data/local/local_lm/data/work/wordlist

+utils/prepare_lang.sh data/local/dict_nosp \
+  "<unk>" data/local/lang_tmp_nosp data/lang_nosp

+local/format_lms.sh

+for x in train eval98 eval98.pem; do
+  this_nj=$(cat data/$x/utt2spk | wc -l)
+  if [ $this_nj -gt 30 ]; then
+    this_nj=30
+  fi

+  steps/make_mfcc.sh --mfcc-config conf/mfcc.conf --nj $this_nj \
+    --cmd "$train_cmd" \
+    data/$x exp/make_mfcc $mfccdir
+  steps/compute_cmvn_stats.sh data/$x exp/make_mfcc $mfccdir
+  utils/fix_data_dir.sh data/$x
+done

+utils/subset_data_dir.sh --shortest data/train 1000 data/train_1kshort
+utils/subset_data_dir.sh data/train 2000 data/train_2k

+# Note: the --boost-silence option should probably be omitted by default
+# for normal setups. It doesn't always help. [it's to discourage non-silence
+# models from modeling silence.]
+steps/train_mono.sh --boost-silence 1.25 --nj $nj --cmd "$train_cmd" \
+  data/train_1kshort data/lang_nosp exp/mono0a

+steps/align_si.sh --boost-silence 1.25 --nj $nj --cmd "$train_cmd" \
+  data/train_2k data/lang_nosp exp/mono0a exp/mono0a_ali

+steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" 2000 10000 \
+  data/train_2k data/lang_nosp exp/mono0a_ali exp/tri1

+steps/align_si.sh --nj $nj --cmd "$train_cmd" \
+  data/train data/lang_nosp exp/tri1 exp/tri1_ali

+steps/train_lda_mllt.sh --cmd "$train_cmd" 2500 15000 \
+  data/train data/lang_nosp exp/tri1_ali exp/tri2

+steps/align_si.sh --nj $nj --cmd "$train_cmd" \
+  data/train data/lang_nosp exp/tri2 exp/tri2_ali

+steps/train_sat.sh --cmd "$train_cmd" 4200 40000 \
+  data/train data/lang_nosp exp/tri2_ali exp/tri3

+utils/mkgraph.sh data/lang_nosp_test exp/tri3 exp/tri3/graph_nosp

+steps/decode_fmllr.sh --nj 20 --cmd "$decode_cmd" \
+  exp/tri3/graph_nosp data/eval98.pem exp/tri3/decode_nosp_eval98.pem
+steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+  data/lang_nosp_test data/lang_nosp_test_rescore \
+  data/eval98.pem exp/tri3/decode_nosp_eval98.pem \
+  exp/tri3/decode_rescore_nosp_eval98.pem

+exit 0
diff --git a/egs/bn/s5/steps b/egs/bn/s5/steps
new file mode 120000
index 00000000000..6e99bf5b5ad
--- /dev/null
+++ b/egs/bn/s5/steps
@@ -0,0 +1 @@
+../../wsj/s5/steps
\ No newline at end of file
diff --git a/egs/bn/s5/utils b/egs/bn/s5/utils
new file mode 120000
index 00000000000..b240885218f
--- /dev/null
+++ b/egs/bn/s5/utils
@@ -0,0 +1 @@
+../../wsj/s5/utils
\ No newline at end of file

From 6e73dec63bd19363b9e633d884ff9917b4b3e932 Mon Sep 17 00:00:00 2001
From: Vimal Manohar
Date: Fri, 6 Jan 2017 18:57:47 -0500
Subject: [PATCH 04/38] bn: Add 1999 BN eval preparation

---
 egs/bn/s5/local/data_prep/hub4_utils.py       | 68 +++++++++++++++++++
 .../prepare_1998_hub4_bn_eng_eval.sh          | 52 ++++----------
 .../prepare_1999_hub4_bn_eng_eval.sh          | 66 ++++++++++++++++++
 3 files changed, 148 insertions(+), 38 deletions(-)
 create mode 100644 
egs/bn/s5/local/data_prep/hub4_utils.py create mode 100644 egs/bn/s5/local/data_prep/prepare_1999_hub4_bn_eng_eval.sh diff --git a/egs/bn/s5/local/data_prep/hub4_utils.py b/egs/bn/s5/local/data_prep/hub4_utils.py new file mode 100644 index 00000000000..a5f11f67c31 --- /dev/null +++ b/egs/bn/s5/local/data_prep/hub4_utils.py @@ -0,0 +1,68 @@ +# Copyright 2016 Vimal Manohar +# Apache 2.0. + +"""This module contains utilities for preparing the HUB4 broadcast news +evaluation corpora. +""" + +import sys +import os + + +def parse_uem_line(reco, line): + """This method parses a 'line' from the UEM for recording 'reco' + and returns the line converted to kaldi segments format. + The format of UEM is + + + We force the channel to be 1 and take the file-id to be the recording-id. + """ + line = line.strip() + if len(line) == 0 or line[0:2] == ";;": + continue + parts = line.split() + + # The channel ID is expected to be 1. + if parts[1] != "1": + raise TypeError("Invalid line {0}".format(line)) + + start_time = float(parts[2]) + end_time = float(parts[3]) + + utt = "{0}-{1:06d}-{2:06d}".format(reco, int(start_time * 100), + int(end_time * 100)) + return "{0} {1} {2} {3}".format(utt, reco, start_time, end_time) + + +def parse_cmu_seg_line(reco, line): + """This line parses a 'line' from the CMU automatic segmentation for + recording 'reco'. + The CMU segmentation has the following format: + + + We force the channel to be 1 and take the file-id to be the recording-id. + """ + line = line.strip() + if len(line) == 0 or line[0:2] == ";;": + continue + parts = line.split() + + # Actually a file, but we assuming 1-1 mapping to recording and force + # channel to be 1. + reco = parts[0] + + # The channel ID is expected to be 1. + if parts[1] != "1": + raise TypeError("Invalid line {0}".format(line)) + spk = parts[2] + start_time = float(parts[3]) + end_time = float(parts[4]) + + utt = "{spk}-{0}-{1:06d}-{2:06d}".format(reco, int(start_time * 100), + int(end_time * 100), spk=spk) + + segment_line = "{0} {1} {st:.3f} {end:.3f}".format( + utt, reco, st=start_time, end=end_time) + utt2spk_line = "{0} {1}".format(utt, spk) + + return (segment_line, utt2spk_line) diff --git a/egs/bn/s5/local/data_prep/prepare_1998_hub4_bn_eng_eval.sh b/egs/bn/s5/local/data_prep/prepare_1998_hub4_bn_eng_eval.sh index a167c2cfee0..f990adbd74a 100755 --- a/egs/bn/s5/local/data_prep/prepare_1998_hub4_bn_eng_eval.sh +++ b/egs/bn/s5/local/data_prep/prepare_1998_hub4_bn_eng_eval.sh @@ -5,7 +5,7 @@ if [ $# -ne 2 ]; then echo "Usage: $0 " - echo "local/prepare_1998_hub4_bn_eng_eval.sh /export/corpora/LDC/LDC2000S86/ data/local/data/eval98" + echo "$0 /export/corpora/LDC/LDC2000S86/ data/local/data/eval98" exit 1 fi @@ -14,55 +14,31 @@ dir=$2 mkdir -p $dir +if [ ! 
-d $SOURCE_DIR/h4e_evl/ ]; then + echo "$0: Invalid SOURCE-DIR for LDC2000S86 corpus" + exit 1 +fi + for uem in $SOURCE_DIR/h4e_evl/h4e_98_{1,2}.uem; do python -c ' import sys, os +import hub4_utils uem = sys.argv[1] reco, ext = os.path.splitext(os.path.basename(uem)) for line in open(uem).readlines(): line = line.strip() - if len(line) == 0 or line[0:2] == ";;": - continue - parts = line.split() - - assert parts[1] == "1" - start_time = float(parts[2]) - end_time = float(parts[3]) - - utt = "{0}-{1:06d}-{2:06d}".format(reco, int(start_time * 100), - int(end_time * 100)) - print ("{0} {1} {2} {3}".format(utt, reco, start_time, end_time))' $uem + print (parse_uem_line(line))' $uem done > $dir/segments cat $SOURCE_DIR/h4e_evl/h4e_98_{1,2}.seg | \ python -c ' -from __future__ import print_function import sys - -segments_handle = open(sys.argv[1], "w") -utt2spk_handle = open(sys.argv[2], "w") -for line in sys.stdin.readlines(): - line = line.strip() - if len(line) == 0 or line[0:2] == ";;": - continue - parts = line.split() - - reco = parts[0] - assert parts[1] == "1" - spk = parts[2] - start_time = float(parts[3]) - end_time = float(parts[4]) - - utt = "{spk}-{0}-{1:06d}-{2:06d}".format(reco, int(start_time * 100), - int(end_time * 100), spk=spk) - - print ("{0} {1} {2} {3}".format(utt, reco, start_time, end_time), - file=segments_handle) - print ("{0} {1}".format(utt, spk), - file=utt2spk_handle) -segments_handle.close() -utt2spk_handle.close() -' $dir/segments.pem $dir/utt2spk.pem +with open(sys.argv[1], "w") as s_f, open(sys.argv[2], "w") as u_f: + for line in sys.stdin.readlines(): + segments_line, utt2spk_line = parse_cmu_seg_line(reco, line) + s_f.write("{0}\n".format(segments_line)) + u_f.write("{0}\n".format(utt2spk_line))' \ + $dir/segments.pem $dir/utt2spk.pem export PATH=$PATH:$KALDI_ROOT/tools/sph2pipe_v2.5 sph2pipe=`which sph2pipe` || { echo "sph2pipe not found in PATH."; exit 1; } diff --git a/egs/bn/s5/local/data_prep/prepare_1999_hub4_bn_eng_eval.sh b/egs/bn/s5/local/data_prep/prepare_1999_hub4_bn_eng_eval.sh new file mode 100644 index 00000000000..133b56b5b36 --- /dev/null +++ b/egs/bn/s5/local/data_prep/prepare_1999_hub4_bn_eng_eval.sh @@ -0,0 +1,66 @@ +#! /bin/bash + +# Copyright 2016 Vimal Manohar +# Apache 2.0. + +if [ $# -ne 2 ]; then + echo "Usage: $0 " + echo "$0 /export/corpora5/LDC/LDC2000S88/ data/local/data/eval99" + exit 1 +fi + +SOURCE_DIR=$1 +dir=$2 + +mkdir -p $dir + +if [ ! 
-d $SOURCE_DIR/hub4_1999/ ]; then + echo "$0: Invalid SOURCE-DIR for LDC2000S88 corpus" + exit 1 +fi + +for uem in $SOURCE_DIR/hub4_1999/bnews_99/bn99en_{1,2}.uem; do + python -c ' +import sys, os +import hub4_utils +uem = sys.argv[1] +reco, ext = os.path.splitext(os.path.basename(uem)) +for line in open(uem).readlines(): + print (parse_uem_line(line))' $uem +done > $dir/segments + +awk '{print $1" "$2}' $dir/segments > $dir/utt2spk + +cat $SOURCE_DIR/hub4_1999/bnews_99/bn99en_{1,2}.seg | \ + python -c ' +import sys +with open(sys.argv[1], "w") as s_f, open(sys.argv[2], "w") as u_f: + for line in sys.stdin.readlines(): + segments_line, utt2spk_line = parse_cmu_seg_line(reco, line) + s_f.write("{0}\n".format(segments_line)) + u_f.write("{0}\n".format(utt2spk_line))' \ + $dir/segments.pem $dir/utt2spk.pem + +export PATH=$PATH:$KALDI_ROOT/tools/sph2pipe_v2.5 +sph2pipe=`which sph2pipe` || { echo "sph2pipe not found in PATH."; exit 1; } +for x in `ls $SOURCE_DIR/h4e_evl/*.sph`; do + y=`basename $x` + z=${y%.sph} + echo "$z $sph2pipe -f wav $x |"; +done > $dir/wav.scp + +awk '{print $1" "$1" 1"}' $dir/wav.scp > $dir/reco2file_and_channel + +cp $SOURCE_DIR/hub4_1999/bnews99/en981118.glm $dir/en981118.glm +cp $SOURCE_DIR/hub4_1999/bnews99/bn99en_1.stm $dir/bn99en_1.stm + +cp $SOURCE_DIR/hub4_1999/bnews99/en991231.glm $dir/en991231.glm +cp $SOURCE_DIR/hub4_1999/bnews99/bn99en_2.stm $dir/bn99en_2.stm + +utils/fix_data_dir.sh $dir +utils/copy_data_dir.sh $dir ${dir}.pem +cp $dir/*.stm ${dir}.pem/ + +cp $dir/segments.pem ${dir}.pem/segments +cp $dir/utt2spk.pem ${dir}.pem/utt2spk +utils/fix_data_dir.sh ${dir}.pem From 917a67059070f1c4e35b295830583c9bd36ada54 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Tue, 10 Jan 2017 20:22:00 -0500 Subject: [PATCH 05/38] bn: Add more data preparation scripts --- egs/bn/s5/local/data_prep/hub4_utils.py | 120 +++++++- .../data_prep/prepare_1995_csr_hub4_corpus.sh | 59 ++++ ...are_bn_data.py => prepare_1996_bn_data.py} | 70 +++-- ....sh => prepare_1996_csr_hub4_lm_corpus.sh} | 19 +- .../prepare_1996_hub4_bn_eng_dev_and_eval.sh | 99 +++++++ .../local/data_prep/prepare_1997_bn_data.py | 2 + .../prepare_1997_hub4_bn_eng_eval.sh | 64 ++++ .../prepare_1998_hub4_bn_eng_eval.sh | 18 +- .../prepare_1999_hub4_bn_eng_eval.sh | 74 ++--- .../data_prep/prepare_na_news_text_corpus.sh | 3 + .../prepare_na_news_text_supplement.sh | 61 ++++ .../data_prep/process_1995_bn_annotation.py | 273 ++++++++++++++++++ .../local/data_prep/process_na_news_text.py | 37 ++- 13 files changed, 816 insertions(+), 83 deletions(-) create mode 100755 egs/bn/s5/local/data_prep/prepare_1995_csr_hub4_corpus.sh rename egs/bn/s5/local/data_prep/{prepare_bn_data.py => prepare_1996_bn_data.py} (74%) rename egs/bn/s5/local/data_prep/{prepare_1996_csr_hub4_corpus.sh => prepare_1996_csr_hub4_lm_corpus.sh} (59%) create mode 100755 egs/bn/s5/local/data_prep/prepare_1996_hub4_bn_eng_dev_and_eval.sh create mode 100755 egs/bn/s5/local/data_prep/prepare_1997_bn_data.py create mode 100755 egs/bn/s5/local/data_prep/prepare_1997_hub4_bn_eng_eval.sh mode change 100644 => 100755 egs/bn/s5/local/data_prep/prepare_1999_hub4_bn_eng_eval.sh create mode 100644 egs/bn/s5/local/data_prep/prepare_na_news_text_supplement.sh create mode 100755 egs/bn/s5/local/data_prep/process_1995_bn_annotation.py diff --git a/egs/bn/s5/local/data_prep/hub4_utils.py b/egs/bn/s5/local/data_prep/hub4_utils.py index a5f11f67c31..b43de80c73b 100644 --- a/egs/bn/s5/local/data_prep/hub4_utils.py +++ b/egs/bn/s5/local/data_prep/hub4_utils.py @@ -5,8 
+5,9 @@ evaluation corpora. """ -import sys import os +import re +import sys def parse_uem_line(reco, line): @@ -19,9 +20,12 @@ def parse_uem_line(reco, line): """ line = line.strip() if len(line) == 0 or line[0:2] == ";;": - continue + return None parts = line.split() + if reco is None: + reco = parts[0] + # The channel ID is expected to be 1. if parts[1] != "1": raise TypeError("Invalid line {0}".format(line)) @@ -34,17 +38,17 @@ def parse_uem_line(reco, line): return "{0} {1} {2} {3}".format(utt, reco, start_time, end_time) -def parse_cmu_seg_line(reco, line): +def parse_cmu_seg_line(line, prepend_reco_to_spk=False): """This line parses a 'line' from the CMU automatic segmentation for - recording 'reco'. + recording. The CMU segmentation has the following format: - + We force the channel to be 1 and take the file-id to be the recording-id. """ line = line.strip() if len(line) == 0 or line[0:2] == ";;": - continue + return None parts = line.split() # Actually a file, but we assuming 1-1 mapping to recording and force @@ -55,14 +59,116 @@ def parse_cmu_seg_line(reco, line): if parts[1] != "1": raise TypeError("Invalid line {0}".format(line)) spk = parts[2] + start_time = float(parts[3]) end_time = float(parts[4]) - utt = "{spk}-{0}-{1:06d}-{2:06d}".format(reco, int(start_time * 100), + if prepend_reco_to_spk: + spk = reco + '-' + spk + utt = "{spk}-{0:06d}-{1:06d}".format(int(start_time * 100), int(end_time * 100), spk=spk) + else: + utt = "{spk}-{reco}-{0:06d}-{1:06d}".format(int(start_time * 100), + int(end_time * 100), + reco=reco, spk=spk) segment_line = "{0} {1} {st:.3f} {end:.3f}".format( utt, reco, st=start_time, end=end_time) utt2spk_line = "{0} {1}".format(utt, spk) return (segment_line, utt2spk_line) + + +def normalize_bn_transcript(text, noise_word, spoken_noise_word): + """Normalize broadcast news transcript for audio.""" + text = text.upper() + # Remove unclear speech markings + text = re.sub(r"\(\(([^)]*)\)\)", r"\1", text) + text = re.sub(r"#", "", text) # Remove overlapped speech markings + # Remove invented word markings + text = re.sub(r"\*\*([^*]+)\*\*", r"\1", text) + text = re.sub(r"\[[^]]+\]", noise_word, text) + text = re.sub(r"\{[^}]+\}", spoken_noise_word, text) + # Remove mispronunciation brackets + text = re.sub(r"\+([^+]+)\+", r"\1", text) + + text1 = [] + for word in text.split(): + # Remove best guesses for proper nouns + word = re.sub(r"^@(\w+)$", r"\1", word) + text1.append(word) + return " ".join(text1) + + +def normalize_csr_transcript(text, noise_word, spoken_noise_word): + """Normalize broadcast news transcript for audio.""" + text = text.upper() + + # Remove long event markings + text = re.sub(r"\[[^]/]+/\]|\[/[^]/]+\]", "", text) + # Remove comments + text = re.sub(r"\{\{[^}]*\}\}", "", text) + # Replace alternative words with a single one (second alternative) + text = re.sub(r"\{[^}/]+/([^}/]+)[^}]*\}", r"\1", text) + # Remove partial word completions + text = re.sub(r"\([^)]+\)-|-\([^)]+\)", "-", text) + # Remove accent marks and diacritics + text = re.sub(r"\\[3-8]", "", text) + + # Remove unclear speech markings + text = re.sub(r"\(\(([^)]*)\)\)", r"\1", text) + text = re.sub(r"#", "", text) # Remove overlapped speech markings + # Remove invented word markings + text = re.sub(r"\*\*([^*]+)\*\*", r"\1", text) + # Replace speaker-made noises with + text = re.sub(r"\[INHALING\]|\[COUGH\]|\[THROAT_CLEARING\]|\[SIGN\]", + spoken_noise_word, text) + # Replace noise with + text = re.sub(r"\[[^]]+\]", noise_word, text) + text = re.sub(r"\+([^+]+)\+", 
r"\1", text) + + # Remove periods after letter. + text = re.sub(r"([A-Z])\.( |$)", r"\1 ", text) + # Replace \. with . + text = re.sub(r"\\.", r".", text) + + text1 = [] + for word in text.split(): + if word == spoken_noise_word or word == noise_word: + text1.append(word) + continue + + # Remove mispronunciation brackets + word = re.sub(r"^@(\w+)$", r"\1", word) + # Remove everything other than the standard ASCII symbols + word = re.sub("[^A-Za-z0-9.' _-]", "", word) + text1.append(word) + return " ".join(text1) + + +def remove_punctuations(text): + """Remove punctuations and some other processing for text sentence.""" + # Remove HTML new lines that are not end of sentences + text1 = re.sub("\n", " ", text) + + # Remove some markers like double dash that are normally used to separate + # name titles in newspapers. + text1 = re.sub(r"(&[^;]+;|--)", " ", text1) + + # Remove quotation marks + text1 = re.sub(r"''|``|\(|\)", " ", text1) + + # Remove everything other than the standard ASCII symbols + text1 = re.sub("[^A-Za-z0-9.' _-]", "", text1) + + # Replace multiple .'s with single and then remove isolated '.' + text1 = re.sub(r"\.[.]+ ", ".", text1) + text1 = re.sub(r" \. ", " ", text1) + + # Remove isolated '-' + text1 = re.sub(r" - ", " ", text1) + + # Replace multiple spaces with single. + text1 = re.sub(r"[ ]+", " ", text1) + + return text1 diff --git a/egs/bn/s5/local/data_prep/prepare_1995_csr_hub4_corpus.sh b/egs/bn/s5/local/data_prep/prepare_1995_csr_hub4_corpus.sh new file mode 100755 index 00000000000..b199fdc8a48 --- /dev/null +++ b/egs/bn/s5/local/data_prep/prepare_1995_csr_hub4_corpus.sh @@ -0,0 +1,59 @@ +#! /bin/bash + +# Copyright 2016 Vimal Manohar +# Apache 2.0. + +# This script prepares the 1995 CSR-IV HUB4 corpus +# https://catalog.ldc.upenn.edu/LDC96S31 + +set -e +set -o pipefail +set -u + +if [ $# -ne 2 ]; then + echo "Usage: $0 " + echo " e.g.: $0 /export/corpora5/LDC/LDC96S31/csr95_hub4 data/local/data/csr95_hub4" + exit 1 +fi + +SOURCE_DIR=$1 +dir=$2 + +for d in $SOURCE_DIR/csr95/h4/devtst $SOURCE_DIR/csr95/h4/evltst \ + $SOURCE_DIR/csr95/h4/train; do + if [ ! -d $d ]; then + echo "$0: Invalid SOURCE-DIR $SOURCE_DIR for LDC96S31 corpus" + exit 1 + fi +done + +mkdir -p $dir + +for x in `ls $SOURCE_DIR/csr95/h4/*/*.wav`; do + y=`basename $x` + z=${y%.wav} + echo "$z $x" +done > $dir/wav_scp + +cat $dir/wav_scp | grep "csr95/h4/train" > $dir/train95_wav_scp +cat $dir/wav_scp | grep "csr95/h4/devtst" > $dir/dev95_wav_scp +cat $dir/wav_scp | grep "csr95/h4/evltst" > $dir/eval95_wav_scp + +rm $dir/*_{segments,utt2spk,text} || true + +for x in `ls $SOURCE_DIR/csr95/h4/*/*.txt`; do + if [[ $x =~ "csr95/h4/train" ]]; then + local/data_prep/process_1995_bn_annotation.py $x \ + $dir/train95_segments $dir/train95_utt2spk $dir/train95_text + fi + + if [[ $x =~ "csr95/h4/devtst" ]]; then + local/data_prep/process_1995_bn_annotation.py $x \ + $dir/dev95_segments $dir/dev95_utt2spk $dir/dev95_text + fi + + if [[ $x =~ "csr95/h4/evltst" ]]; then + local/data_prep/process_1995_bn_annotation.py $x \ + $dir/eval95_segments $dir/eval95_utt2spk $dir/eval95_text + fi +done diff --git a/egs/bn/s5/local/data_prep/prepare_bn_data.py b/egs/bn/s5/local/data_prep/prepare_1996_bn_data.py similarity index 74% rename from egs/bn/s5/local/data_prep/prepare_bn_data.py rename to egs/bn/s5/local/data_prep/prepare_1996_bn_data.py index b96d0503367..26bc69f572b 100755 --- a/egs/bn/s5/local/data_prep/prepare_bn_data.py +++ b/egs/bn/s5/local/data_prep/prepare_1996_bn_data.py @@ -1,5 +1,13 @@ #! 
/usr/bin/env python +# Copyright 2016 Vimal Manohar +# Apache 2.0. + +"""This script prepares the 1996 English Broadcast News (HUB4) corpus. +https://catalog.ldc.upenn.edu/LDC97S44 +https://catalog.ldc.upenn.edu/LDC97T22 +""" + from __future__ import print_function import argparse import glob @@ -7,6 +15,7 @@ import os import re from bs4 import BeautifulSoup +import hub4_utils logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -20,6 +29,13 @@ def get_args(): parser = argparse.ArgumentParser("Prepare BN corpus.") + parser.add_argument("--noise-word", type=str, default="", + help="""Replace all noise words in transcript + with this noise_word""") + parser.add_argument("--spoken-noise-word", type=str, + default="", + help="""Replace all speaker noise words in transcript + with this spoken_noise_word""") parser.add_argument("--split-at-sync", type=str, choices=["true", "false"], default="false", help="If true, creates separate segments split " @@ -48,36 +64,49 @@ def __init__(self, reco_id, speaker=None): self.text = None self.start_time = -1 self.end_time = -1 - if speaker is not None: - self.speaker = speaker - else: - self.speaker = reco_id + self.speaker = speaker def write_segment(self, out_file): """writes segment in kaldi segments format""" - print("{0} {1} {2} {3}".format(self.utt_id(), self.reco_id, + print("{0} {1} {2} {3}".format(self.get_utt_id(), self.reco_id, self.start_time, self.end_time), file=out_file) def write_utt2spk(self, out_file): """writes speaker information in kaldi utt2spk format""" - print("{0} {1}".format(self.utt_id(), self.speaker), + print("{0} {1}".format(self.get_utt_id(), self.get_spk_id()), file=out_file) - def write_text(self, out_file): - print("{0} {1}".format(self.utt_id(), self.text), - file=out_file) + def write_text(self, out_file, noise_word="", + spoken_noise_word=""): + text = hub4_utils.normalize_bn_transcript( + self.text, noise_word, spoken_noise_word) + if len(text) == 0 or re.match(r"^\s*$", text): + return + print("{0} {1}".format(self.get_utt_id(), text), file=out_file) def check(self): """checks if this is a valid segment""" assert self.end_time > self.start_time - def utt_id(self): + def get_utt_id(self): """returns the utterance id created from the recording id and the timing information""" - return ("{spkr}-{0}-{1:06d}-{2:06d}".format( - self.reco_id, int(self.start_time * 100), - int(self.end_time * 100), spkr=self.speaker)) + if self.speaker is None: + return ("{0}-{1:06d}-{2:06d}".format( + self.reco_id, int(self.start_time * 100), + int(self.end_time * 100))) + else: + return ("{0}-{1:06d}-{2:06d}".format( + self.get_spk_id(), int(self.start_time * 100), + int(self.end_time * 100))) + + def get_spk_id(self): + if self.speaker is None: + return ("{0}-{1:06d}-{2:06d}".format( + self.reco_id, int(self.start_time * 100), + int(self.end_time * 100))) + return "{0}-{1}".format(self.reco_id, self.speaker) def duration(self): """returns the duration of the segment""" @@ -129,7 +158,9 @@ def process_segment_soup(reco_id, soup, split_at_sync=False): def process_transcription(transcription_file, segments_handle, utt2spk_handle, - text_handle, split_at_sync=False): + text_handle, split_at_sync=False, + noise_word="", + spoken_noise_word=""): """Processes transcription file into segments.""" doc = ''.join(open(transcription_file).readlines()) tag_matcher = re.compile(r"(<(Sync|Background)[^>]+>)") @@ -158,13 +189,14 @@ def process_transcription(transcription_file, segments_handle, utt2spk_handle, continue 
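+                # write_text below passes the transcript through
+                # hub4_utils.normalize_bn_transcript, so an annotated line
+                # like "he said ((the)) **word** [laugh]" comes out as
+                # "HE SAID THE WORD" plus the configured noise word
+                # (a made-up line, just to illustrate the markup handling).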
s.write_segment(segments_handle) s.write_utt2spk(utt2spk_handle) - s.write_text(text_handle) + s.write_text(text_handle, noise_word, + spoken_noise_word) except Exception: logger.error("Failed processing segment %s", seg) raise -def _run(args): +def run(args): if not os.path.isdir(args.dir): os.makedirs(args.dir) @@ -186,7 +218,9 @@ def _run(args): try: process_transcription(x, segments_handle, utt2spk_handle, text_handle, - split_at_sync=args.split_at_sync) + split_at_sync=args.split_at_sync, + noise_word=args.noise_word, + spoken_noise_word=args.spoken_noise_word) except Exception: logger.error("Failed to process file %s", x) @@ -199,7 +233,7 @@ def _run(args): def main(): try: args = get_args() - _run(args) + run(args) except Exception: raise diff --git a/egs/bn/s5/local/data_prep/prepare_1996_csr_hub4_corpus.sh b/egs/bn/s5/local/data_prep/prepare_1996_csr_hub4_lm_corpus.sh similarity index 59% rename from egs/bn/s5/local/data_prep/prepare_1996_csr_hub4_corpus.sh rename to egs/bn/s5/local/data_prep/prepare_1996_csr_hub4_lm_corpus.sh index fc20758eec0..444a491c7b8 100755 --- a/egs/bn/s5/local/data_prep/prepare_1996_csr_hub4_corpus.sh +++ b/egs/bn/s5/local/data_prep/prepare_1996_csr_hub4_lm_corpus.sh @@ -3,6 +3,9 @@ # Copyright 2016 Vimal Manohar # Apache 2.0. +# This script prepares the 1996 CSR HUB4 Language Model corpus +# https://catalog.ldc.upenn.edu/LDC98T31 + set -e set -o pipefail set -u @@ -16,7 +19,7 @@ stage=0 if [ $# -ne 2 ]; then echo "Usage: $0 " - echo " e.g.: $0 /export/corpora/LDC/LDC98T31/ data/local/data/csr96_hub4" + echo " e.g.: $0 /export/corpora/LDC/LDC98T31/1996_csr_hub4_model data/local/data/csr96_hub4" exit 1 fi @@ -25,9 +28,13 @@ dir=$2 mkdir -p $dir -ls $SOURCE_DIR/1996_csr_hub4_model/st_train/*.stZ \ - $SOURCE_DIR/1996_csr_hub4_model/st_test/*.stZ | sort > \ - $dir/filelist +for d in $SOURCE_DIR/st_train/ $SOURCE_DIR/st_test/; do + if [ ! -d $d ]; then + echo "$0: Invalid SOURCE-DIR $SOURCE_DIR for LDC98T31 corpus" + exit 1 + fi + ls $d/*.stZ +done | sort > $dir/filelist mkdir -p $dir/split$nj/ @@ -38,13 +45,13 @@ if [ $stage -le 1 ]; then $dir/split$nj/filelist.JOB $dir fi -for x in `ls $SOURCE_DIR/1996_csr_hub4_model/st_train/*.stZ`; do +for x in `ls $SOURCE_DIR/st_train/*.stZ`; do y=`basename $x` name=${y%.stZ} echo $dir/${name}.txt.gz done > $dir/train.filelist -for x in `ls $SOURCE_DIR/1996_csr_hub4_model/st_test/*.stZ`; do +for x in `ls $SOURCE_DIR/st_test/*.stZ`; do y=`basename $x` name=${y%.stZ} echo $dir/${name}.txt.gz diff --git a/egs/bn/s5/local/data_prep/prepare_1996_hub4_bn_eng_dev_and_eval.sh b/egs/bn/s5/local/data_prep/prepare_1996_hub4_bn_eng_dev_and_eval.sh new file mode 100755 index 00000000000..7c11531dda5 --- /dev/null +++ b/egs/bn/s5/local/data_prep/prepare_1996_hub4_bn_eng_dev_and_eval.sh @@ -0,0 +1,99 @@ +#! /bin/bash + +# Copyright 2016 Vimal Manohar +# Apache 2.0. + +# This script prepares 1996 English Broadcast News Dev and Eval (HUB4) +# https://catalog.ldc.upenn.edu/LDC97S66 + +set -e +set -o pipefail + +if [ $# -ne 2 ]; then + echo "Usage: $0 " + echo "$0 /export/corpora/LDC/LDC97S66/1996_eng_bcast_dev_eval data/local/data/hub4_96_dev_eval" + exit 1 +fi + +SOURCE_DIR=$1 +dir=$2 + +mkdir -p $dir + +for d in $SOURCE_DIR/dev/devdata $SOURCE_DIR/eval/evaldata; do + if [ ! 
-d $d ]; then + echo "$0: Invalid SOURCE-DIR $SOURCE_DIR for LDC97S66 corpus" + exit 1 + fi +done + +for d in dev eval; do + if [ $d == "dev" ]; then + suffix=dt + else + suffix=ev + fi + + python -c ' +import sys, os +sys.path.insert(0, "local/data_prep") +import hub4_utils +uem = sys.argv[1] +for line in open(uem).readlines(): + line = hub4_utils.parse_uem_line(None, line) + if line is not None: + print (line)' $SOURCE_DIR/${d}/${d}data/h496${suffix}.uem > $dir/${d}96_uem_segments + awk '{print $1" "$2}' $dir/${d}96_uem_segments > $dir/${d}96_uem_utt2spk +done + +for d in dev eval; do + if [ $d == "dev" ]; then + suffix=dt + else + suffix=ev + fi + + cat $SOURCE_DIR/${d}/${d}data/h496${suffix}.pem | \ + python -c ' +import sys +sys.path.insert(0, "local/data_prep") +import hub4_utils +with open(sys.argv[1], "w") as s_f, open(sys.argv[2], "w") as u_f: + for line in sys.stdin.readlines(): + tup = hub4_utils.parse_cmu_seg_line(line, prepend_reco_to_spk=True) + if tup is not None: + segments_line, utt2spk_line = tup + s_f.write("{0}\n".format(segments_line)) + u_f.write("{0}\n".format(utt2spk_line))' \ + $dir/${d}96_pem_segments $dir/${d}96_pem_utt2spk +done + +export PATH=$PATH:$KALDI_ROOT/tools/sph2pipe_v2.5 +sph2pipe=`which sph2pipe` || { echo "sph2pipe not found in PATH."; exit 1; } + +for x in `ls $SOURCE_DIR/dev/devdata/*.sph`; do + y=`basename $x` + z=${y%.sph} + echo "$z $sph2pipe -f wav $x |"; +done > $dir/dev96_wav_scp + +cat $dir/dev96_pem_segments | awk '{print $2}' | \ + utils/filter_scp.pl /dev/stdin $dir/dev96_wav_scp > $dir/dev96_pem_wav_scp +cat $dir/dev96_uem_segments | awk '{print $2}' | \ + utils/filter_scp.pl /dev/stdin $dir/dev96_wav_scp > $dir/dev96_uem_wav_scp + +for x in `ls $SOURCE_DIR/eval/evaldata/*.sph`; do + y=`basename $x` + z=${y%.sph} + echo "$z $sph2pipe -f wav $x |"; +done > $dir/eval96_wav_scp + +cp $SOURCE_DIR/eval/evaldata/et96_1.glm $dir/glm + +cp $SOURCE_DIR/eval/evaldata/et96_1.utm $dir/eval96_utm +cp $SOURCE_DIR/dev/devdata/et96_1.utm $dir/dev96_utm + +cp $SOURCE_DIR/eval/evaldata/h496ev.stm $dir/eval96_stm + +cp $SOURCE_DIR/dev/devdata/h496dtpe.stm $dir/dev96_pem_stm +cp $SOURCE_DIR/dev/devdata/h496dtue.stm $dir/dev96_uem_stm diff --git a/egs/bn/s5/local/data_prep/prepare_1997_bn_data.py b/egs/bn/s5/local/data_prep/prepare_1997_bn_data.py new file mode 100755 index 00000000000..0dd9b4dca58 --- /dev/null +++ b/egs/bn/s5/local/data_prep/prepare_1997_bn_data.py @@ -0,0 +1,2 @@ + +/export/corpora/LDC/LDC98T28 diff --git a/egs/bn/s5/local/data_prep/prepare_1997_hub4_bn_eng_eval.sh b/egs/bn/s5/local/data_prep/prepare_1997_hub4_bn_eng_eval.sh new file mode 100755 index 00000000000..8ef0817065f --- /dev/null +++ b/egs/bn/s5/local/data_prep/prepare_1997_hub4_bn_eng_eval.sh @@ -0,0 +1,64 @@ +#! /bin/bash + +# Copyright 2016 Vimal Manohar +# Apache 2.0. + +# This script prepares 1997 HUB4 English Evaluation corpus +# https://catalog.ldc.upenn.edu/LDC2002S11 + +set -e +set -o pipefail + +if [ $# -ne 2 ]; then + echo "Usage: $0 " + echo "$0 /export/corpora/LDC/LDC2002S11/hub4e_97 data/local/data/eval97" + exit 1 +fi + +SOURCE_DIR=$1 +dir=$2 + +mkdir -p $dir + +if [ ! 
-d $SOURCE_DIR/h4e_evl/ ]; then + echo "$0: Invalid SOURCE-DIR $SOURCE_DIR for LDC2002S11 corpus" + exit 1 +fi + +for uem in $SOURCE_DIR/h4e_evl/h4e_97.uem; do + python -c ' +import sys, os +sys.path.insert(0, "local/data_prep") +import hub4_utils +uem = sys.argv[1] +reco, ext = os.path.splitext(os.path.basename(uem)) +for line in open(uem).readlines(): + line = hub4_utils.parse_uem_line(reco, line) + if line is not None: + print (line)' $uem +done > $dir/segments +awk '{print $1" "$2}' $dir/segments > $dir/utt2spk + +cat $SOURCE_DIR/h4e_evl/h4e_97.seg | \ + python -c ' +import sys +sys.path.insert(0, "local/data_prep") +import hub4_utils +with open(sys.argv[1], "w") as s_f, open(sys.argv[2], "w") as u_f: + for line in sys.stdin.readlines(): + tup = hub4_utils.parse_cmu_seg_line(line) + if tup is not None: + segments_line, utt2spk_line = tup + s_f.write("{0}\n".format(segments_line)) + u_f.write("{0}\n".format(utt2spk_line))' $dir/segments.pem $dir/utt2spk.pem + +export PATH=$PATH:$KALDI_ROOT/tools/sph2pipe_v2.5 +sph2pipe=`which sph2pipe` || { echo "sph2pipe not found in PATH."; exit 1; } +for x in `ls $SOURCE_DIR/h4e_evl/*.sph`; do + y=`basename $x` + z=${y%.sph} + echo "$z $sph2pipe -f wav $x |"; +done > $dir/wav.scp + +cp $SOURCE_DIR/h4e_evl/h4e_97_1.glm $dir/glm +cp $SOURCE_DIR/h4e_evl/h4e_97.stm $dir/stm diff --git a/egs/bn/s5/local/data_prep/prepare_1998_hub4_bn_eng_eval.sh b/egs/bn/s5/local/data_prep/prepare_1998_hub4_bn_eng_eval.sh index f990adbd74a..ccefc3dcd66 100755 --- a/egs/bn/s5/local/data_prep/prepare_1998_hub4_bn_eng_eval.sh +++ b/egs/bn/s5/local/data_prep/prepare_1998_hub4_bn_eng_eval.sh @@ -3,6 +3,12 @@ # Copyright 2016 Vimal Manohar # Apache 2.0. +# This script prepares 1998 HUB4 Broadcast News Evaluation English Test Material +# https://catalog.ldc.upenn.edu/LDC2000S86 + +set -e +set -o pipefail + if [ $# -ne 2 ]; then echo "Usage: $0 " echo "$0 /export/corpora/LDC/LDC2000S86/ data/local/data/eval98" @@ -29,6 +35,7 @@ for line in open(uem).readlines(): line = line.strip() print (parse_uem_line(line))' $uem done > $dir/segments +awk '{print $1" "$2}' $dir/segments > $dir/utt2spk cat $SOURCE_DIR/h4e_evl/h4e_98_{1,2}.seg | \ python -c ' @@ -48,16 +55,5 @@ for x in `ls $SOURCE_DIR/h4e_evl/*.sph`; do echo "$z $sph2pipe -f wav $x |"; done > $dir/wav.scp -awk '{print $1" "$1" 1"}' $dir/wav.scp > $dir/reco2file_and_channel - cp $SOURCE_DIR/h4e_evl/h4e_98.glm $dir/glm cp $SOURCE_DIR/h4e_evl/h4e_98.stm $dir/stm - -awk '{print $1" "$2}' $dir/segments > $dir/utt2spk - -utils/fix_data_dir.sh $dir -utils/copy_data_dir.sh $dir ${dir}.pem - -cp $dir/segments.pem ${dir}.pem/segments -cp $dir/utt2spk.pem ${dir}.pem/utt2spk -utils/fix_data_dir.sh ${dir}.pem diff --git a/egs/bn/s5/local/data_prep/prepare_1999_hub4_bn_eng_eval.sh b/egs/bn/s5/local/data_prep/prepare_1999_hub4_bn_eng_eval.sh old mode 100644 new mode 100755 index 133b56b5b36..8a6d4d4b8ae --- a/egs/bn/s5/local/data_prep/prepare_1999_hub4_bn_eng_eval.sh +++ b/egs/bn/s5/local/data_prep/prepare_1999_hub4_bn_eng_eval.sh @@ -3,9 +3,15 @@ # Copyright 2016 Vimal Manohar # Apache 2.0. +# This script prepares 1999 HUB4 Broadcast News Evaluation English Test Material +# https://catalog.ldc.upenn.edu/LDC2000S88 + +set -e +set -o pipefail + if [ $# -ne 2 ]; then echo "Usage: $0 " - echo "$0 /export/corpora5/LDC/LDC2000S88/ data/local/data/eval99" + echo "$0 /export/corpora5/LDC/LDC2000S88/hub4_1999 data/local/data/eval99" exit 1 fi @@ -14,53 +20,53 @@ dir=$2 mkdir -p $dir -if [ ! -d $SOURCE_DIR/hub4_1999/ ]; then +if [ ! 
-d $SOURCE_DIR/bnews_99/ ]; then
   echo "$0: Invalid SOURCE-DIR for LDC2000S88 corpus"
   exit 1
 fi
 
-for uem in $SOURCE_DIR/hub4_1999/bnews_99/bn99en_{1,2}.uem; do
+export PATH=$PATH:$KALDI_ROOT/tools/sph2pipe_v2.5
+sph2pipe=`which sph2pipe` || { echo "sph2pipe not found in PATH."; exit 1; }
+
+for f in bn99en_1 bn99en_2; do
+  if [ $f == "bn99en_1" ]; then
+    affix=eval99_1
+  elif [ $f == "bn99en_2" ]; then
+    affix=eval99_2
+  fi
+
   python -c '
 import sys, os
+sys.path.insert(0, "local/data_prep")
 import hub4_utils
 uem = sys.argv[1]
 reco, ext = os.path.splitext(os.path.basename(uem))
 for line in open(uem).readlines():
-    print (parse_uem_line(line))' $uem
-done > $dir/segments
+    line = hub4_utils.parse_uem_line(reco, line)
+    if line is not None:
+        print (line)' $SOURCE_DIR/bnews_99/$f.uem > $dir/${affix}_uem_segments
 
-awk '{print $1" "$2}' $dir/segments > $dir/utt2spk
+  awk '{print $1" "$2}' $dir/${affix}_uem_segments > $dir/${affix}_uem_utt2spk
 
-cat $SOURCE_DIR/hub4_1999/bnews_99/bn99en_{1,2}.seg | \
-  python -c '
+  cat $SOURCE_DIR/bnews_99/$f.seg | \
+    python -c '
 import sys
+sys.path.insert(0, "local/data_prep")
+import hub4_utils
 with open(sys.argv[1], "w") as s_f, open(sys.argv[2], "w") as u_f:
     for line in sys.stdin.readlines():
-        segments_line, utt2spk_line = parse_cmu_seg_line(reco, line)
-        s_f.write("{0}\n".format(segments_line))
-        u_f.write("{0}\n".format(utt2spk_line))' \
-  $dir/segments.pem $dir/utt2spk.pem
-
-export PATH=$PATH:$KALDI_ROOT/tools/sph2pipe_v2.5
-sph2pipe=`which sph2pipe` || { echo "sph2pipe not found in PATH."; exit 1; }
-for x in `ls $SOURCE_DIR/h4e_evl/*.sph`; do
-  y=`basename $x`
-  z=${y%.sph}
-  echo "$z $sph2pipe -f wav $x |";
-done > $dir/wav.scp
-
-awk '{print $1" "$1" 1"}' $dir/wav.scp > $dir/reco2file_and_channel
-
-cp $SOURCE_DIR/hub4_1999/bnews99/en981118.glm $dir/en981118.glm
-cp $SOURCE_DIR/hub4_1999/bnews99/bn99en_1.stm $dir/bn99en_1.stm
-
-cp $SOURCE_DIR/hub4_1999/bnews99/en991231.glm $dir/en991231.glm
-cp $SOURCE_DIR/hub4_1999/bnews99/bn99en_2.stm $dir/bn99en_2.stm
+        tup = hub4_utils.parse_cmu_seg_line(line)
+        if tup is not None:
+            segments_line, utt2spk_line = tup
+            s_f.write("{0}\n".format(segments_line))
+            u_f.write("{0}\n".format(utt2spk_line))' \
+      $dir/${affix}_pem_segments $dir/${affix}_pem_utt2spk
+
+  echo "$f $sph2pipe -f wav $SOURCE_DIR/bnews_99/$f.sph |" > $dir/${affix}_wav_scp
+done
 
-utils/fix_data_dir.sh $dir
-utils/copy_data_dir.sh $dir ${dir}.pem
-cp $dir/*.stm ${dir}.pem/
+cp $SOURCE_DIR/bnews_99/en981118.glm $dir/eval99_1_glm
+cp $SOURCE_DIR/bnews_99/bn99en_1.stm $dir/eval99_1_stm
 
-cp $dir/segments.pem ${dir}.pem/segments
-cp $dir/utt2spk.pem ${dir}.pem/utt2spk
-utils/fix_data_dir.sh ${dir}.pem
+cp $SOURCE_DIR/bnews_99/en991231.glm $dir/eval99_2_glm
+cp $SOURCE_DIR/bnews_99/bn99en_2.stm $dir/eval99_2_stm
diff --git a/egs/bn/s5/local/data_prep/prepare_na_news_text_corpus.sh b/egs/bn/s5/local/data_prep/prepare_na_news_text_corpus.sh
index 44138e2a228..c32e48a3d7e 100755
--- a/egs/bn/s5/local/data_prep/prepare_na_news_text_corpus.sh
+++ b/egs/bn/s5/local/data_prep/prepare_na_news_text_corpus.sh
@@ -3,6 +3,9 @@
 # Copyright 2016  Vimal Manohar
 # Apache 2.0.
 
+# This script prepares the North American News Text Corpus
+# https://catalog.ldc.upenn.edu/LDC95T21
+
 . cmd.sh
 . 
path.sh diff --git a/egs/bn/s5/local/data_prep/prepare_na_news_text_supplement.sh b/egs/bn/s5/local/data_prep/prepare_na_news_text_supplement.sh new file mode 100644 index 00000000000..dd463df46fc --- /dev/null +++ b/egs/bn/s5/local/data_prep/prepare_na_news_text_supplement.sh @@ -0,0 +1,61 @@ +#! /bin/bash + +# Copyright 2016 Vimal Manohar +# Apache 2.0. + +# This script prepares the North American News Text Supplement Corpus +# https://catalog.ldc.upenn.edu/LDC98T30 + +. cmd.sh +. path.sh + +set -e +set -o pipefail +set -u + +nj=4 +cmd=run.pl + +. utils/parse_options.sh + +if [ $# -ne 2 ]; then + echo "Usage: $0 " + echo " e.g.: $0 /export/corpora/LDC/LDC98T30/northam_news_txt_sup data/local/data/na_news_supp" + exit 1 +fi + +SOURCE_DIR=$1 +dir=$2 + +for x in $SOURCE_DIR/nyt/*/ $SOURCE_DIR/latwp/ $SOURCE_DIR/apws/*/; do + year=`basename $x` + newspaper=`basename $(dirname $x)` + + d=$dir/${newspaper}_${year} + + if [ $year == latwp ]; then + d=$dir/latwp_1997 + elif [ $year == english ]; then + d=$dir/apws + fi + + mkdir -p $d + + list_file=$d/articles.list + ls $x/*.gz > $list_file + + mkdir -p $d/split$nj + + eval utils/split_scp.pl $d/articles.list \ + $d/split$nj/articles.list.{`seq -s, $nj`} + + ( + $cmd JOB=1:$nj $d/log/get_processed_text.JOB.log \ + local/data_prep/process_na_news_text.py $d/split$nj/articles.list.JOB - \| \ + gzip -c '>' $d/corpus.JOB.gz || exit 1 + gunzip -c $d/corpus.*.gz | gzip -c > $d/corpus.gz || exit 1 + rm $d/corpus.*.gz + ) & +done + +wait diff --git a/egs/bn/s5/local/data_prep/process_1995_bn_annotation.py b/egs/bn/s5/local/data_prep/process_1995_bn_annotation.py new file mode 100755 index 00000000000..be0c7ad8e0d --- /dev/null +++ b/egs/bn/s5/local/data_prep/process_1995_bn_annotation.py @@ -0,0 +1,273 @@ +#! /usr/bin/env python + +# Copyright 2016 Vimal Manohar +# Apache 2.0. + +"""This script process a 1995 CSR-IV annotation file and writes to +utt2spk, segments and text files. +""" + +from __future__ import print_function +import argparse +import os +import logging +import re +from bs4 import BeautifulSoup +import hub4_utils + + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) +handler = logging.StreamHandler() +handler.setLevel(logging.INFO) +formatter = logging.Formatter("%(asctime)s [%(pathname)s:%(lineno)s - " + "%(funcName)s - %(levelname)s ] %(message)s") +handler.setFormatter(formatter) +logger.addHandler(handler) + + +def get_args(): + """Get command-line arguments""" + + parser = argparse.ArgumentParser("Process 1995 CSR-IV HUB4 transcripts") + + parser.add_argument("--noise-word", type=str, default="", + help="Word to add in-place of noise words") + parser.add_argument("--spoken-noise-word", type=str, + default="", + help="Word to add in-place of speaker noise words") + parser.add_argument("in_file", type=argparse.FileType('r'), + help="Input transcript file") + parser.add_argument("segments_file", type=argparse.FileType('a'), + help="Output segments file") + parser.add_argument("utt2spk_file", type=argparse.FileType('a'), + help="Output utt2spk file") + parser.add_argument("text_file", type=argparse.FileType('a'), + help="Output text file") + + args = parser.parse_args() + return args + + +class Segment(object): + """Class to store an utterance (segment)""" + + def __init__(self, reco_id, spk=None, start_time=-1, + end_time=-2, text=""): + """The arguments are straight-forward. + spk can be None if speaker is not known, in which case the utterance-id + and speaker-id are made the same. 
+        end_time can be -1 to mean the end of the recording.
+        """
+        self.reco_id = reco_id
+        self.spk = spk
+        self.start_time = float(start_time)
+        self.end_time = float(end_time)
+        self.text = text
+
+    def get_utt_id(self):
+        """Return the utterance-id, which is the recording-id with the
+        start and end times appended if spk is not known.
+        Otherwise the speaker-id is also added as a suffix to the
+        recording-id.
+        """
+        if self.spk is None:
+            return "{reco}-{0:06d}-{1:06d}".format(
+                int(self.start_time * 100), int(self.end_time * 100),
+                reco=self.reco_id)
+        return "{reco}-{spk}-{0:06d}-{1:06d}".format(
+            int(self.start_time * 100), int(self.end_time * 100),
+            reco=self.reco_id, spk=self.spk)
+
+    def get_spk_id(self):
+        """Returns the speaker-id appended to the recording-id if the speaker
+        is known; otherwise returns the utterance-id as the speaker-id.
+        """
+        if self.spk is None:
+            return "{reco}-{0:06d}-{1:06d}".format(
+                int(self.start_time * 100), int(self.end_time * 100),
+                reco=self.reco_id)
+        return "{reco}-{spk}".format(reco=self.reco_id, spk=self.spk)
+
+    def write_utt2spk(self, out_file):
+        """Writes this segment's entry into utt2spk file."""
+        print ("{0} {1}".format(self.get_utt_id(), self.get_spk_id()),
+               file=out_file)
+
+    def write_segment(self, out_file):
+        """Writes this segment's entry into segments file."""
+        print ("{0} {1} {2:.3f} {3:.3f}".format(
+            self.get_utt_id(), self.reco_id,
+            self.start_time, self.end_time),
+            file=out_file)
+
+    def write_text(self, out_file):
+        """Writes this segment's entry into kaldi text file."""
+        print ("{0} {1}".format(self.get_utt_id(), self.text),
+               file=out_file)
+
+
+def write_segments(segments, args):
+    """Write segments with non-empty transcripts."""
+    for segment in segments:
+        if len(segment.text) == 0:
+            continue
+        segment.write_utt2spk(args.utt2spk_file)
+        segment.write_segment(args.segments_file)
+        segment.write_text(args.text_file)
+
+
+def process_text(text, noise_word, spoken_noise_word):
+    """Returns normalized text"""
+    text = re.sub(r"\[pause\]", "", text)
+    text = hub4_utils.normalize_csr_transcript(text, noise_word,
+                                               spoken_noise_word)
+    return text
+
+
+test_spk_matcher = re.compile(r"(\S+)\(bt=(\S+)\set=(\S+)\):\s(.+)$")
+train_spk_matcher = re.compile(r"(\S+):\s(.+)$")
+
+
+def process_story_content(args, reco_id, content,
+                          start_time, end_time):
+    """Processes the contents of a story and converts them into a set of
+    segments.
+
+    Arguments:
+        args -- A reference to the CLI arguments
+        reco_id -- Recording id
+        content -- A string containing all the contents of a story (or the
+                   stuff before the story like the credits and announcements).
+                   It is split on double-newline characters.
+        start_time -- Start time of this 'story'.
+        end_time -- End time of this 'story'.
+    """
+
+    segments = []
+    segment_tmp = Segment(reco_id=reco_id, spk=None,
+                          start_time=start_time, end_time=-2, text="")
+
+    for line in content.split('\n\n'):
+        line = re.sub('\n', ' ', line)
+
+        if len(line) == 0 or re.match(r"\[[^]]+\]$|\s*$", line):
+            continue
+
+        m = test_spk_matcher.match(line)
+        if m:
+            # A line of story in test file that has start and end times
+            # and speaker name.
+            spk = m.group(1)
+            bt = float(m.group(2))
+            et = float(m.group(3))
+
+            # Once we know the end-time of the temporary segment, we can
+            # write that out (only if it is non-empty).
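+            # For example, a test-file line "SMITH(bt=12.30 et=15.60): GOOD
+            # EVENING" gives spk=SMITH, bt=12.30, et=15.60 and the raw
+            # transcript text (a made-up line matching test_spk_matcher).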
+ if len(segment_tmp.text) > 0: + segment_tmp.end_time = bt + segments.append(segment_tmp) + segment_tmp = Segment(reco_id, spk=None, start_time=et) + + text = process_text(m.group(4), args.noise_word, + args.spoken_noise_word) + if len(text) == 0 or re.match(r"\[[^]]+\]$|\s*$", text): + continue + segments.append(Segment(reco_id=reco_id, spk=spk, + start_time=bt, end_time=et, + text=text)) + continue + + m = train_spk_matcher.match(line) + if m: + # A line of story in train file that has no time segment + # information. So speaker information is not useful. + text = process_text(m.group(2), args.noise_word, + args.spoken_noise_word) + else: + # A line of story that does not have a speaker marking. + text = process_text(line, args.noise_word, args.spoken_noise_word) + if len(text) == 0 or re.match(r"\[[^]]+\]$|\s*$", text): + continue + segment_tmp.text += ' ' + text + + if len(segment_tmp.text) > 0: + segment_tmp.end_time = end_time + segments.append(segment_tmp) + + return segments + + +def process_float(string): + string = re.sub(r"'|\"", "", string) + return float(string) + + +def run(args): + base = os.path.basename(args.in_file.name) + reco_id = os.path.splitext(base)[0] + + doc = ''.join(args.in_file.readlines()) + + soup = BeautifulSoup(doc, 'lxml') + for broadcast in soup.find_all('broadcast'): + non_story_contents = [] + start_time = 0.0 + end_time = -1.0 + for s in broadcast.children: + try: + if s.name == 'story': + story_begin_time = process_float(s['bt']) + story_end_time = process_float(s['et']) + for x in s.find_all('language') + s.find_all('sung'): + x.replaceWithChildren() + if len(non_story_contents): + end_time = story_begin_time + segments = process_story_content( + args, reco_id, ' '.join(non_story_contents), + start_time=start_time, end_time=end_time) + write_segments(segments, args) + non_story_contents = [] + start_time = story_end_time + segments = process_story_content( + args, reco_id, + ' '.join([unicode(x) for x in s.children]), + start_time=story_begin_time, end_time=story_end_time) + write_segments(segments, args) + elif (s.name is not None and s.name != "language" + and s.name != 'sung'): + raise RuntimeError( + "Expected a NavigableString or " + "or or ; got {0}".format(s)) + elif s.name == "language" or s.name == "sung": + non_story_contents.append( + ' '.join([unicode(x) for x in s.children])) + else: + non_story_contents.append(unicode(s)) + except RuntimeError: + raise + except Exception: + logger.error("Failed to process broadcast children %s", s) + raise + # End for loop over broadcast children + if len(non_story_contents) > 0: + segments = process_story_content( + args, reco_id, ' '.join(non_story_contents), + start_time=start_time, end_time=-1) + write_segments(segments, args) + + +def main(): + try: + args = get_args() + run(args) + except Exception: + raise + finally: + for f in [args.in_file, args.segments_file, + args.utt2spk_file, args.text_file]: + if f is not None: + f.close() + + +if __name__ == '__main__': + main() diff --git a/egs/bn/s5/local/data_prep/process_na_news_text.py b/egs/bn/s5/local/data_prep/process_na_news_text.py index 10941dd3186..d7bb36aa3f7 100755 --- a/egs/bn/s5/local/data_prep/process_na_news_text.py +++ b/egs/bn/s5/local/data_prep/process_na_news_text.py @@ -41,17 +41,21 @@ def normalize_text(text): return text2 -def process_file(file_handle, out_file_handle): - doc = ' '.join(file_handle.readlines()) +def process_file_lines(lines, out_file_handle): + doc = '' + for line in lines: + line = re.sub(r"([^", "", line) + 
line = re.sub(r"

", "

", line) + doc += line soup = BeautifulSoup(doc, 'lxml') num_written = 0 - for doc in soup.html.body.children: + for art in soup.html.body.children: try: - if doc.name != "doc": + if art.name != "art": continue - for para in doc.find_all('p'): + for para in art.find_all('p'): assert para.name == 'p' text = ' '.join([unicode(x).strip() for x in para.contents]) normalized_text = normalize_text(text) @@ -69,8 +73,27 @@ def _run(args): for line in args.file_list.readlines(): try: file_ = line.strip() - with gzip.open(file_, 'r') as f: - process_file(f, args.out_file) + p = run_command( + "gunzip -c {0} | " + "local/data_prep/csr_hub4_utils/pare-sgml.perl | " + "perl local/data_prep/csr_hub4_utils/bugproc.perl | " + "perl local/data_prep/csr_hub4_utils/numhack.perl | " + "perl local/data_prep/csr_hub4_utils/numproc.perl " + " -xlocal/data_prep/csr_hub4_utils/num_excp | " + "perl local/data_prep/csr_hub4_utils/abbrproc.perl " + " local/data_prep/csr_hub4_utils/abbrlist | " + "perl local/data_prep/csr_hub4_utils/puncproc.perl -np" + "".format(file_), + stdout=subprocess.PIPE, shell=True) + + stdout = p[0].communicate()[0] + if p[0].returncode is not 0: + logger.error( + "Command '%s' failed with return status %d", + p[1], p[0].returncode) + raise RuntimeError + + process_file_lines(stdout, args.out_file) except Exception: logger.error("Failed processing file %s", file_) raise From d275480af7c4ae310a673dd0fd9b7a00be5aeb01 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Tue, 10 Jan 2017 20:22:27 -0500 Subject: [PATCH 06/38] bn: Fix MFCC config --- egs/bn/s5/conf/mfcc.conf | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/egs/bn/s5/conf/mfcc.conf b/egs/bn/s5/conf/mfcc.conf index a4be40be454..7361509099f 100644 --- a/egs/bn/s5/conf/mfcc.conf +++ b/egs/bn/s5/conf/mfcc.conf @@ -1,6 +1 @@ ---sample-frequency=16000 ---frame-length=25 # the default is 25 ---low-freq=20 # the default. ---high-freq=3700 # the default is zero meaning use the Nyquist (4k in this case). ---num-ceps=20 # higher than the default which is 12. ---snip-edges=false +--use-energy=false # only non-default option. 
From 4f94a5c563ed6bda7c18b537a15294bfb32eac85 Mon Sep 17 00:00:00 2001
From: Vimal Manohar
Date: Tue, 10 Jan 2017 20:22:57 -0500
Subject: [PATCH 07/38] bn: Clean and update recipe

---
 egs/bn/s5/local/format_data.sh | 90 ++++++++++++++--
 egs/bn/s5/local/format_lms.sh | 7 +-
 egs/bn/s5/local/prepare_dict.sh | 184 ++++++++++++++++++--------------
 egs/bn/s5/local/train_lm.sh | 154 ++++++++++++++++++--------
 4 files changed, 298 insertions(+), 137 deletions(-)

diff --git a/egs/bn/s5/local/format_data.sh b/egs/bn/s5/local/format_data.sh
index b7d58f83718..3f5bb29195d 100755
--- a/egs/bn/s5/local/format_data.sh
+++ b/egs/bn/s5/local/format_data.sh
@@ -11,18 +11,86 @@ echo "$0 $@" # Print the command line for logging
 srcdir=data/local/data
 tmpdir=data/local/
 
-for t in train; do
- utils/fix_data_dir.sh $srcdir/$t
- utils/copy_data_dir.sh $srcdir/$t data/$t
- cat $srcdir/$t/text | \
- local/normalize_transcripts.pl "<NOISE>" "<SPOKEN_NOISE>" > \
- data/$t/text
- utils/fix_data_dir.sh data/$t
-done
+###############################################################################
+# Format 1996 English Broadcast News Train (HUB4)
+###############################################################################
+mkdir -p data/train_bn96
+cp $srcdir/train_bn96/{wav.scp,segments,utt2spk} data/train_bn96
+
+###############################################################################
+# Format 1996 English Broadcast News Dev (HUB4)
+###############################################################################
+mkdir -p data/dev96pe
+mkdir -p data/dev96ue
+
+cp $srcdir/hub4_96_dev_eval/dev96_uem_segments data/dev96ue/segments
+cp $srcdir/hub4_96_dev_eval/dev96_uem_utt2spk data/dev96ue/utt2spk
+cp $srcdir/hub4_96_dev_eval/dev96_uem_wav_scp data/dev96ue/wav.scp
+cp $srcdir/hub4_96_dev_eval/dev96_uem_stm data/dev96ue/stm
+cp $srcdir/hub4_96_dev_eval/glm data/dev96ue/glm
+
+awk '{if ($4 > $3) print $0}' $srcdir/hub4_96_dev_eval/dev96_pem_segments \
+ > data/dev96pe/segments
+cp $srcdir/hub4_96_dev_eval/dev96_pem_utt2spk data/dev96pe/utt2spk
+cp $srcdir/hub4_96_dev_eval/dev96_pem_wav_scp data/dev96pe/wav.scp
+cp $srcdir/hub4_96_dev_eval/dev96_pem_stm data/dev96pe/stm
+cp $srcdir/hub4_96_dev_eval/glm data/dev96pe/glm
+
+###############################################################################
+# Format 1996 English Broadcast News Eval (HUB4)
+###############################################################################
+mkdir -p data/eval96
+mkdir -p data/eval96.pem
 
-for t in eval98 eval98.pem; do
- utils/copy_data_dir.sh $srcdir/$t data/$t
- utils/fix_data_dir.sh data/$t
+cp $srcdir/hub4_96_dev_eval/eval96_pem_segments data/eval96.pem/segments
+cp $srcdir/hub4_96_dev_eval/eval96_pem_utt2spk data/eval96.pem/utt2spk
+cp $srcdir/hub4_96_dev_eval/eval96_wav_scp data/eval96.pem/wav.scp
+cp $srcdir/hub4_96_dev_eval/eval96_stm data/eval96.pem/stm
+cp $srcdir/hub4_96_dev_eval/glm data/eval96.pem/glm
+
+cp $srcdir/hub4_96_dev_eval/eval96_uem_segments data/eval96/segments
+cp $srcdir/hub4_96_dev_eval/eval96_uem_utt2spk data/eval96/utt2spk
+cp $srcdir/hub4_96_dev_eval/eval96_wav_scp data/eval96/wav.scp
+cp $srcdir/hub4_96_dev_eval/eval96_stm data/eval96/stm
+cp $srcdir/hub4_96_dev_eval/glm data/eval96/glm
+
+###############################################################################
+# Format 1997-98 Hub4 Broadcast news evaluation
+###############################################################################
+for t in eval97 eval98; do
+ mkdir -p data/$t data/${t}.pem
+ cp $srcdir/$t/segments data/$t/segments
+ cp $srcdir/$t/utt2spk data/$t/utt2spk
+ cp $srcdir/$t/segments.pem data/${t}.pem/segments
+ cp $srcdir/$t/utt2spk.pem data/${t}.pem/utt2spk
+ cp $srcdir/$t/wav.scp data/$t/wav.scp
+ cp $srcdir/$t/wav.scp data/${t}.pem/wav.scp
+ cp $srcdir/$t/stm data/$t/stm
+ cp $srcdir/$t/stm data/${t}.pem/stm
+ cp $srcdir/$t/glm data/$t/glm
+ cp $srcdir/$t/glm data/${t}.pem/glm
 done
 
+###############################################################################
+# Format 1999 Hub4 Broadcast news evaluation
+###############################################################################
+for d in eval99_1 eval99_2; do
+ mkdir -p data/${d} data/${d}.pem
+ cp $srcdir/eval99/${d}_uem_segments data/${d}/segments
+ cp $srcdir/eval99/${d}_uem_utt2spk data/${d}/utt2spk
+ cp $srcdir/eval99/${d}_pem_segments data/${d}.pem/segments
+ cp $srcdir/eval99/${d}_pem_utt2spk data/${d}.pem/utt2spk
+ cp $srcdir/eval99/${d}_wav_scp data/${d}/wav.scp
+ cp $srcdir/eval99/${d}_wav_scp data/${d}.pem/wav.scp
+ cp $srcdir/eval99/${d}_stm data/${d}/stm
+ cp $srcdir/eval99/${d}_stm data/${d}.pem/stm
+ cp $srcdir/eval99/${d}_glm data/${d}/glm
+ cp $srcdir/eval99/${d}_glm data/${d}.pem/glm
+done
+
+for d in train_bn96 eval96 eval96.pem dev96pe dev96ue eval97 eval97.pem \
+ eval98 eval98.pem eval99_1 eval99_1.pem eval99_2 eval99_2.pem; do
+ utils/utt2spk_to_spk2utt.pl data/$d/utt2spk > data/$d/spk2utt
+ awk '{print $1" "$1" 1"}' data/${d}/wav.scp > data/${d}/reco2file_and_channel
+ utils/fix_data_dir.sh data/${d}
+done
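The awk filter on the dev96 PEM segments above drops entries whose end time is not strictly greater than the start time, which would otherwise produce zero-length utterances that downstream tools reject. The same check in Python form, as a sketch:

    def filter_segments(lines):
        # keep only 'utt reco start end' entries with positive duration,
        # like awk '{if ($4 > $3) print $0}'
        kept = []
        for line in lines:
            utt_id, reco_id, start, end = line.split()
            if float(end) > float(start):
                kept.append(line)
        return kept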
diff --git a/egs/bn/s5/local/format_lms.sh b/egs/bn/s5/local/format_lms.sh
index 7d9e3b82bfb..834e3d10d0a 100755
--- a/egs/bn/s5/local/format_lms.sh
+++ b/egs/bn/s5/local/format_lms.sh
@@ -8,12 +8,13 @@ if [ -f path.sh ]; then . path.sh; fi
 set -e -o pipefail -u
 
 lang_suffix=_test
+local_lm_dir=data/local/local_lm
 
 . utils/parse_options.sh
 
-#arpa_lm=data/local/local_lm/data/arpa/4gram.arpa.gz
-small_arpa_lm=data/local/local_lm/data/arpa/4gram_small.arpa.gz
-big_arpa_lm=data/local/local_lm/data/arpa/4gram_big.arpa.gz
+#arpa_lm=$local_lm_dir/data/arpa/4gram.arpa.gz
+small_arpa_lm=$local_lm_dir/data/arpa/4gram_small.arpa.gz
+big_arpa_lm=$local_lm_dir/data/arpa/4gram_big.arpa.gz
 
 for f in $small_arpa_lm $big_arpa_lm data/lang_nosp/words.txt; do
 [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1
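The prepare_dict.sh changes below, besides adding stage gating, keep the CMUdict handling in which stress-marked variants of a phone (AH0, AH1, AH2) are grouped onto a single line of nonsilence_phones.txt. The perl one-liner that does this is easy to miss; its logic, restated as a Python sketch:

    import re
    from collections import defaultdict

    def group_phones_by_base(symbols):
        # AH0, AH1, AH2 share the base phone AH and go on one line
        phones_of = defaultdict(list)
        for sym in symbols:
            m = re.match(r'^([^\d]+)(\d*)$', sym)
            if m is None:
                raise ValueError('Bad phone {0}'.format(sym))
            phones_of[m.group(1)].append(sym)
        return [' '.join(group) for group in phones_of.values()]

    # group_phones_by_base(['AA0', 'AA1', 'AA2', 'B']) -> ['AA0 AA1 AA2', 'B']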
diff --git a/egs/bn/s5/local/prepare_dict.sh b/egs/bn/s5/local/prepare_dict.sh
index 441849329e1..d0a6a6703e4 100755
--- a/egs/bn/s5/local/prepare_dict.sh
+++ b/egs/bn/s5/local/prepare_dict.sh
@@ -38,6 +38,7 @@ set -u
 # run this from ../
 
 dict_suffix=
+stage=-1
 
 echo "$0 $@" # Print the command line for logging
 . utils/parse_options.sh || exit 1;
@@ -65,53 +66,63 @@ fi
 
 #(2) Dictionary preparation:
 
-# Make phones symbol-table (adding in silence and verbal and non-verbal noises at this point).
-# We are adding suffixes _B, _E, _S for beginning, ending, and singleton phones.
+if [ $stage -le 0 ]; then
+ # Make phones symbol-table (adding in silence and verbal and non-verbal noises at this point).
+ # We are adding suffixes _B, _E, _S for beginning, ending, and singleton phones.
 
-# silence phones, one per line.
-(echo SIL; echo SPN; echo NSN) > $dir/silence_phones.txt
-echo SIL > $dir/optional_silence.txt
+ # silence phones, one per line.
+ (echo SIL; echo SPN; echo NSN; echo UNK;) > $dir/silence_phones.txt
+ echo SIL > $dir/optional_silence.txt
 
-# nonsilence phones; on each line is a list of phones that correspond
-# really to the same base phone.
-cat $dir/cmudict/cmudict.0.7a.symbols | perl -ane 's:\r::; print;' | \
- perl -e 'while(<>){
+ # nonsilence phones; on each line is a list of phones that correspond
+ # really to the same base phone.
+ cat $dir/cmudict/cmudict.0.7a.symbols | perl -ane 's:\r::; print;' | \
+ perl -e 'while(<>){
 chop; m:^([^\d]+)(\d*)$: || die "Bad phone $_";
 $phones_of{$1} .= "$_ "; }
 foreach $list (values %phones_of) {print $list . "\n"; } ' \
- > $dir/nonsilence_phones.txt || exit 1;
+ > $dir/nonsilence_phones.txt || exit 1;
 
-# A few extra questions that will be added to those obtained by automatically clustering
-# the "real" phones. These ask about stress; there's also one for silence.
-cat $dir/silence_phones.txt| awk '{printf("%s ", $1);} END{printf "\n";}' > $dir/extra_questions.txt || exit 1;
-cat $dir/nonsilence_phones.txt | perl -e 'while(<>){ foreach $p (split(" ", $_)) {
+ # A few extra questions that will be added to those obtained by automatically clustering
+ # the "real" phones. These ask about stress; there's also one for silence.
+ cat $dir/silence_phones.txt| awk '{printf("%s ", $1);} END{printf "\n";}' > $dir/extra_questions.txt || exit 1;
+ cat $dir/nonsilence_phones.txt | perl -e 'while(<>){ foreach $p (split(" ", $_)) {
 $p =~ m:^([^\d]+)(\d*)$: || die "Bad phone $_"; $q{$2} .= "$p "; } } foreach $l (values %q) {print "$l\n";}' \
- >> $dir/extra_questions.txt || exit 1;
+ >> $dir/extra_questions.txt || exit 1;
 
-grep -v ';;;' $dir/cmudict/cmudict.0.7a | \
- perl -ane 'if(!m:^;;;:){ s:(\S+)\(\d+\) :$1 :; print; }' \
- > $dir/dict.cmu || exit 1;
+ grep -v ';;;' $dir/cmudict/cmudict.0.7a | \
+ perl -ane 'if(!m:^;;;:){ s:(\S+)\(\d+\) :$1 :; print; }' \
+ > $dir/dict.cmu || exit 1;
 
-# Add to cmudict the silences, noises etc.
+ # Add to cmudict the silences, noises etc.
 
-(echo '!SIL SIL'; echo '<SPOKEN_NOISE> SPN'; echo '<UNK> SPN'; echo '<NOISE> NSN'; ) | \
- cat - $dir/dict.cmu > $dir/lexicon2_raw.txt
-awk '{print $1}' $dir/lexicon2_raw.txt > $dir/orig_wordlist
+ (echo '!SIL SIL'; echo '<SPOKEN_NOISE> SPN'; echo '<UNK> UNK'; echo '<NOISE> NSN'; ) | \
+ cat - $dir/dict.cmu > $dir/lexicon2_raw.txt
+ awk '{print $1}' $dir/lexicon2_raw.txt > $dir/orig_wordlist
 
-cat <<EOF >$dir/silence_phones.txt
+ cat <<EOF >$dir/silence_phones.txt
 SIL
 SPN
 NSN
+UNK
 EOF
-if [ ! -f exp/g2p/.done ]; then
- steps/dict/train_g2p.sh --cmd "$train_cmd" \
- --silence-phones $dir/silence_phones.txt \
- $dir/dict.cmu exp/g2p
- touch exp/g2p/.done
 fi
+
+if [ $stage -le 2 ]; then
+ if [ ! -f exp/g2p/.done ]; then
+ steps/dict/train_g2p.sh --cmd "$train_cmd" \
+ --silence-phones $dir/silence_phones.txt \
+ $dir/dict.cmu exp/g2p
+ touch exp/g2p/.done
+ fi
+fi
+
+export PATH=$PATH:`pwd`/local/dict
+
+if [ $stage -le 3 ]; then
+ cat $wordlist | python -c '
 import sys
 
 words = {}
 for line in sys.stdin.readlines():
@@ -126,65 +137,76 @@ for line in sys.stdin.readlines():
 for oov in oovs:
 print (oov)' $dir/orig_wordlist | sort -u > $dir/oovlist
-
-export PATH=$PATH:`pwd`/local/dict
-
-cat $dir/oovlist | get_acronym_prons.pl $dir/lexicon2_raw.txt > $dir/dict.acronyms
+
+ cat $dir/oovlist | \
+ get_acronym_prons.pl $dir/lexicon2_raw.txt > $dir/dict.acronyms
+fi
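Stage 3 above finds out-of-vocabulary words with an inline python -c snippet that the hunk only shows partially. The idea is a set difference between the LM word list and the words already covered by the raw lexicon; a self-contained sketch of that step (names are illustrative, not the recipe's exact code):

    def find_oovs(wordlist_path, lexicon_words_path):
        with open(lexicon_words_path) as f:
            known = set(line.split()[0] for line in f if line.strip())
        oovs = set()
        with open(wordlist_path) as f:
            for line in f:
                for word in line.split():
                    if word not in known:
                        oovs.add(word)
        return sorted(oovs)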
 
 mkdir -p $dir/f $dir/b # forward, backward directions of rules...
+
+if [ $stage -le 4 ]; then
 # forward is normal suffix
 # rules, backward is reversed (prefix rules). These
 # dirs contain stuff we create while making the rule-based
 # extensions to the dictionary.
-# Remove ; and , from words, if they are present; these
-# might crash our scripts, as they are used as separators there.
-filter_dict.pl $dir/dict.cmu > $dir/f/dict
-cat $dir/oovlist | filter_dict.pl > $dir/f/oovs
-reverse_dict.pl $dir/f/dict > $dir/b/dict
-reverse_dict.pl $dir/f/oovs > $dir/b/oovs
-
-# The next stage takes a few minutes.
-# Note: the forward stage takes longer, as English is
-# mostly a suffix-based language, and there are more rules
-# that it finds.
-for d in $dir/f $dir/b; do
- (
- cd $d
- cat dict | get_rules.pl 2>get_rules.log >rules
- get_rule_hierarchy.pl rules >hierarchy
- awk '{print $1}' dict | get_candidate_prons.pl rules dict | \
- limit_candidate_prons.pl hierarchy | \
- score_prons.pl dict | \
- count_rules.pl >rule.counts
- # the sort command below is just for convenience of reading.
- score_rules.pl <rule.counts | sort -t';' -k3,3 -nr >rules.with_scores
- get_candidate_prons.pl rules.with_scores dict oovs | \
- limit_candidate_prons.pl hierarchy > oovs.candidates
- ) &
-done
-wait
-
-# Merge the candidates.
-reverse_candidates.pl $dir/b/oovs.candidates | cat - $dir/f/oovs.candidates | sort > $dir/oovs.candidates
-select_candidate_prons.pl <$dir/oovs.candidates | awk -F';' '{printf("%s %s\n", $1, $2);}' \
- > $dir/dict.oovs
-
-cat $dir/dict.acronyms $dir/dict.oovs | sort | uniq > $dir/dict.oovs_merged
-awk '{print $1}' $dir/dict.oovs_merged | uniq > $dir/oovlist.handled
-sort $dir/oovlist | diff - $dir/oovlist.handled | grep -v 'd' | sed 's:< ::' > $dir/oovlist.not_handled
-
-steps/dict/apply_g2p.sh --cmd "$train_cmd" \
- $dir/oovlist.not_handled exp/g2p exp/g2p/oov_lex
-cat exp/g2p/oov_lex/lexicon.lex | cut -f 1,3 | awk '{if (NF > 1) print $0}' > \
- $dir/dict.oovs_g2p
-
-# the sort | uniq is to remove a duplicated pron from cmudict.
-cat $dir/lexicon2_raw.txt $dir/dict.oovs_merged $dir/dict.oovs_g2p | sort | uniq > \
- $dir/lexicon.txt || exit 1;
-# lexicon.txt is without the _B, _E, _S, _I markers.
-
-rm $dir/lexiconp.txt 2>/dev/null || true
+ # Remove ; and , from words, if they are present; these
+ # might crash our scripts, as they are used as separators there.
+ filter_dict.pl $dir/dict.cmu > $dir/f/dict
+ cat $dir/oovlist | filter_dict.pl > $dir/f/oovs
+ reverse_dict.pl $dir/f/dict > $dir/b/dict
+ reverse_dict.pl $dir/f/oovs > $dir/b/oovs
+fi
+
+if [ $stage -le 5 ]; then
+ # The next stage takes a few minutes.
+ # Note: the forward stage takes longer, as English is
+ # mostly a suffix-based language, and there are more rules
+ # that it finds.
+ for d in $dir/f $dir/b; do
+ (
+ cd $d
+ cat dict | get_rules.pl 2>get_rules.log >rules
+ get_rule_hierarchy.pl rules >hierarchy
+ awk '{print $1}' dict | get_candidate_prons.pl rules dict | \
+ limit_candidate_prons.pl hierarchy | \
+ score_prons.pl dict | \
+ count_rules.pl >rule.counts
+ # the sort command below is just for convenience of reading.
+ score_rules.pl <rule.counts | sort -t';' -k3,3 -nr >rules.with_scores
+ get_candidate_prons.pl rules.with_scores dict oovs | \
+ limit_candidate_prons.pl hierarchy > oovs.candidates
+ ) &
+ done
+ wait
+fi
+
+if [ $stage -le 6 ]; then
+ # Merge the candidates.
+ reverse_candidates.pl $dir/b/oovs.candidates | cat - $dir/f/oovs.candidates | sort > $dir/oovs.candidates + select_candidate_prons.pl <$dir/oovs.candidates | awk -F';' '{printf("%s %s\n", $1, $2);}' \ + > $dir/dict.oovs + + cat $dir/dict.acronyms $dir/dict.oovs | sort | uniq > $dir/dict.oovs_merged + awk '{print $1}' $dir/dict.oovs_merged | uniq > $dir/oovlist.handled + sort $dir/oovlist | { diff - $dir/oovlist.handled || true; } | grep -v 'd' | sed 's:< ::' > $dir/oovlist.not_handled +fi + +if [ $stage -le 7 ]; then + steps/dict/apply_g2p.sh --cmd "$train_cmd" \ + $dir/oovlist.not_handled exp/g2p exp/g2p/oov_lex + cat exp/g2p/oov_lex/lexicon.lex | cut -f 1,3 | awk '{if (NF > 1) print $0}' > \ + $dir/dict.oovs_g2p +fi + +if [ $stage -le 8 ]; then + # the sort | uniq is to remove a duplicated pron from cmudict. + cat $dir/lexicon2_raw.txt $dir/dict.oovs_merged $dir/dict.oovs_g2p | sort | uniq > \ + $dir/lexicon.txt || exit 1; + # lexicon.txt is without the _B, _E, _S, _I markers. + + rm $dir/lexiconp.txt 2>/dev/null || true +fi echo "Dictionary preparation succeeded" diff --git a/egs/bn/s5/local/train_lm.sh b/egs/bn/s5/local/train_lm.sh index d8523ca30f4..8f82fe397e0 100755 --- a/egs/bn/s5/local/train_lm.sh +++ b/egs/bn/s5/local/train_lm.sh @@ -14,11 +14,13 @@ set -o pipefail set -u stage=0 +dir=data/local/local_lm +cmd=run.pl +vocab_size= # Preferred vocabulary size echo "$0 $@" # Print the command line for logging . utils/parse_options.sh || exit 1; -dir=data/local/local_lm lm_dir=${dir}/data mkdir -p $dir @@ -46,23 +48,43 @@ if [ $stage -le 0 ]; then rm ${dir}/data/text/* 2>/dev/null || true - cat data/train/text | shuf > ${dir}/train_text - head -n $num_dev_sentences < ${dir}/train_text | cut -d ' ' -f 2- > ${dir}/data/text/dev.txt - tail -n +$[num_dev_sentences+1] < ${dir}/train_text | cut -d ' ' -f 2- > ${dir}/data/text/bn.txt + cat data/train_bn96/text | shuf > ${dir}/train_bn96_text + head -n $num_dev_sentences < ${dir}/train_bn96_text | cut -d ' ' -f 2- > \ + ${dir}/data/text/dev.txt + tail -n +$[num_dev_sentences+1] < ${dir}/train_text | cut -d ' ' -f 2- > \ + ${dir}/data/text/train_bn96.txt + # Get text from NA News corpus for x in data/local/data/na_news/*; do y=`basename $x` [ -f $x/corpus.gz ] && ln -sf `readlink -f $x/corpus.gz` ${dir}/data/text/${y}.txt.gz done + # Get text from 1996 CSR HUB4 LM corpus + for x in `cat data/local/data/csr96_hub4/{train,test}.filelist`; do + gunzip -c $x + done | gzip -c > ${dir}/data/text/csr96_hub4.txt.gz + + # Get text from 1995 CSR-IV HUB4 corpus + cat data/local/data/csr95_hub4/dev95_text \ + data/local/data/csr95_hub4/eval95_text \ + data/local/data/csr95_hub4/train95_text | cut -d ' ' -f 2- > \ + ${dir}/data/text/csr95_hub4.txt + + # # Get text from NA News supplement corpus + # for x in data/local/data/na_news/*; do + # y=`basename $x` + # [ -f $x/corpus.gz ] && ln -sf `readlink -f $x/corpus.gz` ${dir}/data/text/${y}.txt.gz + # done + # for reporting perplexities, we'll use the "real" dev set. - # (a subset of the training data is used as ${dir}/data/text/ted.txt to work - # out interpolation weights. # note, we can't put it in ${dir}/data/text/, because then pocolm would use # it as one of the data sources. 
- cat data/eval98/stm | awk '!/^;;/ {if (NF > 6) print $0}' | cut -d ' ' -f 1,7- | \
- local/normalize_transcripts.pl "<NOISE>" "<SPOKEN_NOISE>" | \
- cut -d ' ' -f 2- > ${dir}/data/real_dev_set.txt
+ for x in dev96pe dev96ue eval96 eval97 eval98 eval99_1 eval99_2; do
+ cat data/$x/stm | awk '!/^;;/ {if (NF > 6) print $0}' | cut -d ' ' -f 1,7- | \
+ local/normalize_transcripts.pl "<NOISE>" "<SPOKEN_NOISE>" | \
+ cut -d ' ' -f 2- > ${dir}/data/${x}.txt
+ done
 fi
 
 if [ $stage -le 1 ]; then
@@ -74,59 +96,97 @@ if [ $stage -le 1 ]; then
 fi
 
 if [ $stage -le 2 ]; then
- for x in data/local/data/na_news/*; do
- y=$dir/data/work/word_counts/`basename $x`.counts
- [ -f $y ] && cat $y
- done | local/lm/merge_word_counts.py 15 > $dir/data/work/na_news.wordlist_counts
-
- cat $dir/data/work/word_counts/{bn,dev}.counts | \
- local/lm/merge_word_counts.py 2 > $dir/data/work/bn.wordlist_counts
+ # decide on the vocabulary.
 
- cat $dir/data/work/na_news.wordlist_counts $dir/data/work/bn.wordlist_counts | \
- perl -ane 'if ($F[1] =~ m/[A-Za-z]/) { print "$F[1]\n"; }' | \
- sort -u > $dir/data/work/wordlist
+ # NA news corpus is not clean. So better not to get vocabulary from there.
+ # for x in data/local/data/na_news/*; do
+ # y=$dir/data/work/word_counts/`basename $x`.counts
+ # [ -f $y ] && cat $y
+ # done | local/lm/merge_word_counts.py 15 > $dir/data/work/na_news.wordlist_counts
+
+ cat $dir/data/work/word_counts/{train_bn96,dev}.counts | \
+ local/lm/merge_word_counts.py 2 > $dir/data/work/train_bn96.wordlist_counts
+
+ cat $dir/data/work/word_counts/csr96_hub4_{tr,ts}.counts | \
+ local/lm/merge_word_counts.py 5 > $dir/data/work/csr96_hub4.wordlist_counts
+
+ cat $dir/data/work/word_counts/csr95_hub4.counts | \
+ local/lm/merge_word_counts.py 5 > $dir/data/work/csr95_hub4.wordlist_counts
+
+ cat $dir/data/work/{train_bn96,csr96_hub4,csr95_hub4}.wordlist_counts | \
+ perl -ane 'if ($F[1] =~ m/[A-Za-z]/) { print "$F[0] $F[1]\n"; }' | \
+ local/lm/merge_word_counts.py 1 | sort -k 1,1nr > $dir/data/work/final.wordlist_counts
+
+ if [ ! -z "$vocab_size" ]; then
+ awk -v sz=$vocab_size 'BEGIN{count=-1;}
+ { i+=1;
+ if (i == int(sz)) {
+ count = $1;
+ };
+ if (count > 0 && count != $1) {
+ exit(0);
+ }
+ print $0;
+ }' $dir/data/work/final.wordlist_counts
+ else
+ cat $dir/data/work/final.wordlist_counts
+ fi | awk '{print $2}' > $dir/data/work/wordlist
 fi
 
 order=4
 wordlist=$dir/data/work/wordlist
 
-min_counts='default=5 bn=1'
+min_counts='default=5 train_bn96=1 csr96_hub4=2,3 csr95_hub4=2,3'
 
 lm_name="`basename ${wordlist}`_${order}"
 if [ -n "${min_counts}" ]; then
- lm_name+="_`echo ${min_counts} | tr -s "[:blank:]" "_" | tr "=" "-"`"
+ lm_name+="_`echo ${min_counts} | tr -s "[:blank:]" "_" | tr "," "." | tr "=" "-"`"
 fi
 unpruned_lm_dir=${lm_dir}/${lm_name}.pocolm
 
+export PATH=$KALDI_ROOT/tools/pocolm/scripts:$PATH
+
 if [ $stage -le 3 ]; then
- # decide on the vocabulary.
- # Note: if you have more than one order, use a certain amount of words as the - # vocab and want to restrict max memory for 'sort', echo "$0: training the unpruned LM" - train_lm.py --wordlist=$wordlist --num-splits=10 --warm-start-ratio=20 \ - --limit-unk-history=true \ - --fold-dev-into=bn \ - --min-counts="${min_counts}" \ - ${dir}/data/text ${order} ${lm_dir}/work ${unpruned_lm_dir} - - get_data_prob.py ${dir}/data/real_dev_set.txt ${unpruned_lm_dir} 2>&1 | grep -F '[perplexity' - #[perplexity = 157.87] over 18290.0 words + + $cmd ${unpruned_lm_dir}/log/train.log \ + train_lm.py --wordlist=$wordlist --num-splits=10 --warm-start-ratio=20 \ + --limit-unk-history=true \ + --fold-dev-into=train_bn96 \ + --min-counts="${min_counts}" \ + ${dir}/data/text ${order} ${lm_dir}/work ${unpruned_lm_dir} + + for x in dev96ue dev96pe eval96 eval97 eval98 eval99_1 eval99_2; do + $cmd ${unpruned_lm_dir}/log/compute_data_prob_${x}_set.log \ + get_data_prob.py ${dir}/data/${x}_set.txt ${unpruned_lm_dir} + + cat ${unpruned_lm_dir}/log/compute_data_prob_${x}_set.log | grep -F '[perplexity' + done - mkdir -p ${dir}/data/arpa - format_arpa_lm.py ${unpruned_lm_dir} | gzip -c > ${dir}/data/arpa/${order}gram.arpa.gz + # train_lm.py: Ngram counts: 190742 + 31139856 + 14766071 + 13851899 = 59948568 + # train_lm.py: You can set --bypass-metaparameter-optimization='1.000,0.007,0.000,0.002,0.000,0.006,0.003,0.000,0.000,0.000,0.001,0.002,0.002,0.000,0.000,0.000,0.003,0.000,0.000,0.604,0.187,0.044,0.012,1.000,0.490,0.026,0.001,0.732,0.328,0.281,0.218' to get equivalent results + # get_data_prob.py: log-prob of data/local/local_lm_bn_nanews_csr96/data/real_dev_set.txt given model data/local/local_lm_bn_nanews_csr96/data/wordlist_4_default-5_bn-1.pocolm was -4.9927348506 per word [perplexity = 147.338822662] over 33180.0 words. fi if [ $stage -le 4 ]; then echo "$0: pruning the LM (to larger size)" # Using 10 million n-grams for a big LM for rescoring purposes. size=10000000 - prune_lm_dir.py --target-num-ngrams=$size --initial-threshold=0.02 ${unpruned_lm_dir} ${dir}/data/lm_${order}_prune_big + $cmd ${dir}/data/lm_${order}_prune_big/log/prune_lm.log \ + prune_lm_dir.py --target-num-ngrams=$size --initial-threshold=0.02 \ + ${unpruned_lm_dir} ${dir}/data/lm_${order}_prune_big - get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_big 2>&1 | grep -F '[perplexity' + for x in dev96ue dev96pe eval96 eval97 eval98 eval99_1 eval99_2; do + $cmd ${dir}/data/lm_${order}_prune_big/log/compute_data_prob_${x}_set.log \ + get_data_prob.py ${dir}/data/${x}_set.txt ${dir}/data/lm_${order}_prune_big - # current results, after adding --limit-unk-history=true: - # get_data_prob.py: log-prob of data/local/local_lm/data/real_dev_set.txt given model data/local/local_lm/data/lm_4_prune_big was -5.16562818753 per word [perplexity = 175.147449465] over 18290.0 words. + cat ${dir}/data/lm_${order}_prune_big/log/compute_data_prob_${x}_set.log | grep -F '[perplexity' + done + # get_data_prob.py data/local/local_lm_bn_nanews_csr96/data/real_dev_set.txt data/local/local_lm_bn_nanews_csr96/data/lm_4_prune_big + # grep -F '[perplexity' + # get_data_prob.py: log-prob of data/local/local_lm_bn_nanews_csr96/data/real_dev_set.txt given model data/local/local_lm_bn_nanews_csr96/data/lm_4_prune_big was -5.05700399638 per word [perplexity = 157.11908113] + # over 33180.0 words. 
mkdir -p ${dir}/data/arpa format_arpa_lm.py ${dir}/data/lm_${order}_prune_big | gzip -c > ${dir}/data/arpa/${order}gram_big.arpa.gz @@ -137,12 +197,22 @@ if [ $stage -le 5 ]; then # Using 2 million n-grams for a smaller LM for graph building. Prune from the # bigger-pruned LM, it'll be faster. size=2000000 - prune_lm_dir.py --target-num-ngrams=$size ${dir}/data/lm_${order}_prune_big ${dir}/data/lm_${order}_prune_small + + $cmd ${dir}/data/lm_${order}_prune_small/log/prune_lm.log \ + prune_lm_dir.py --target-num-ngrams=$size ${dir}/data/lm_${order}_prune_big \ + ${dir}/data/lm_${order}_prune_small - get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_small 2>&1 | grep -F '[perplexity' + for x in dev96ue dev96pe eval96 eval97 eval98 eval99_1 eval99_2; do + $cmd ${dir}/data/lm_${order}_prune_small/log/compute_data_prob_${x}_set.log \ + get_data_prob.py ${dir}/data/${x}_set.txt ${dir}/data/lm_${order}_prune_big + + cat ${dir}/data/lm_${order}_prune_small/log/compute_data_prob_${x}_set.log | grep -F '[perplexity' + done - # current results, after adding --limit-unk-history=true (needed for modeling OOVs and not blowing up LG.fst): - # get_data_prob.py: log-prob of data/local/local_lm/data/real_dev_set.txt given model data/local/local_lm/data/lm_4_prune_small was -5.29432352378 per word [perplexity = 199.202824404 over 18290.0 words. + # get_data_prob.py data/local/local_lm_bn_nanews_csr96/data/real_dev_set.txt data/local/local_lm_bn_nanews_csr96/data/lm_4_prune_small + # grep -F '[perplexity' + # get_data_prob.py: log-prob of data/local/local_lm_bn_nanews_csr96/data/real_dev_set.txt given model data/local/local_lm_bn_nanews_csr96/data/lm_4_prune_small was -5.27172473478 per word [perplexity = 194.751567749] over 33180.0 words. + # float-counts-to-pre-arpa: output [ 190743 673670 802551 351512 ] n-grams format_arpa_lm.py ${dir}/data/lm_${order}_prune_small | gzip -c > ${dir}/data/arpa/${order}gram_small.arpa.gz fi From eb6fccbbf14d95a61f1d47c941ede41504a09d2e Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Tue, 10 Jan 2017 20:23:59 -0500 Subject: [PATCH 08/38] bn: Remove local/lm/text_normalization.py --- egs/bn/s5/local/lm/text_normalization.py | 42 ------------------------ 1 file changed, 42 deletions(-) delete mode 100644 egs/bn/s5/local/lm/text_normalization.py diff --git a/egs/bn/s5/local/lm/text_normalization.py b/egs/bn/s5/local/lm/text_normalization.py deleted file mode 100644 index f74da60a6ef..00000000000 --- a/egs/bn/s5/local/lm/text_normalization.py +++ /dev/null @@ -1,42 +0,0 @@ - -# Copyright 2016 Vimal Manohar -# Apache 2.0. - -"""This module contains methods for doing text normalization of broadcast news -and similar text corpora. 
-""" - -import re - - -def normalize_bn_transcript(text, noise_word, spoken_noise_word): - """Normalize broadcast news transcript for audio.""" - text.upper() - # Remove unclear speech markings - text = re.sub(r"\(\(([^)]*)\)\)", r"\1", text) - text = re.sub(r"#", "", text) # Remove overlapped speech markings - # Remove invented word markings - text = re.sub(r"\*\*([^*]+)\*\*", r"\1", text) - text = re.sub(r"\[[^]]+\]", noise_word, text) - text = re.sub(r"\{[^}]+\}", spoken_noise_word, text) - text = re.sub(r"\+([^+]+)\+", r"\1", text) - - text1 = [] - for word in text.split(): - # Remove mispronunciation brackets - word = re.sub(r"^@(\w+)$", r"\1", word) - text1.append(word) - return " ".join(text1) - - -def remove_punctuations(text): - """Remove punctuations and some other processing for text sentence.""" - text1 = re.sub("\n", " ", text) - text1 = re.sub(r"(&[^;]+;|--)", " ", text1) - text1 = re.sub(r"''|``|\(|\)", " ", text1) - text1 = re.sub("[^A-Za-z0-9.' _-]", "", text1) - text1 = re.sub(r"\. ", " ", text1) - text1 = re.sub(r"([^0-9$-])\.([^0-9]|$)", r"\1\2", text1) - text1 = re.sub(r" - ", " ", text1) - text1 = re.sub(r"[ ]+", " ", text1) - return text1 From 351b447291e4a128eccfe66a1f590781a67b222c Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Tue, 10 Jan 2017 20:24:48 -0500 Subject: [PATCH 09/38] bn: Fix normalize_transcripts --- egs/bn/s5/local/normalize_transcripts.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/bn/s5/local/normalize_transcripts.pl b/egs/bn/s5/local/normalize_transcripts.pl index cccf75def4a..069476cbc37 100755 --- a/egs/bn/s5/local/normalize_transcripts.pl +++ b/egs/bn/s5/local/normalize_transcripts.pl @@ -37,8 +37,8 @@ $trans =~ s:\*\*([^*]+)\*\*:$1 :g; # Remove invented word markings $trans =~ s:\[[^]]+\]:$noise_word :g; $trans =~ s:\{[^}]+\}:$spoken_noise_word :g; + $trans =~ s:^[+]([^+]+)[+]$:$1:; # Remove mispronunciation brackets foreach $w (split (" ",$trans)) { - $w =~ s:^[+](.+)[+]$:$1:; # Remove mispronunciation brackets $w =~ s:^@(.*)$:$1:; # Remove best guesses for proper nouns print " $w"; } From 643881e17ce01b8eb0bf3d5059759893eca5a948 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Thu, 12 Jan 2017 15:10:11 -0500 Subject: [PATCH 10/38] bn: Updated recipe to add more LM corpora --- egs/bn/s5/local/prepare_dict.sh | 14 +++---- egs/bn/s5/local/train_lm.sh | 66 ++++++++++++++++++++++++++------- 2 files changed, 59 insertions(+), 21 deletions(-) diff --git a/egs/bn/s5/local/prepare_dict.sh b/egs/bn/s5/local/prepare_dict.sh index d0a6a6703e4..c0b2e7c0174 100755 --- a/egs/bn/s5/local/prepare_dict.sh +++ b/egs/bn/s5/local/prepare_dict.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright 2010-2012 Microsoft Corporation +# Copyright 2010-2012 Microsoft Corporation # 2012-2014 Johns Hopkins University (Author: Daniel Povey) # 2015 Guoguo Chen # 2016 Vimal Manohar @@ -30,9 +30,9 @@ # silence_phones.txt . path.sh -. cmd.sh +. cmd.sh -set -e +set -e set -o pipefail set -u @@ -78,7 +78,7 @@ if [ $stage -le 0 ]; then # really to the same base phone. cat $dir/cmudict/cmudict.0.7a.symbols | perl -ane 's:\r::; print;' | \ perl -e 'while(<>){ - chop; m:^([^\d]+)(\d*)$: || die "Bad phone $_"; + chop; m:^([^\d]+)(\d*)$: || die "Bad phone $_"; $phones_of{$1} .= "$_ "; } foreach $list (values %phones_of) {print $list . "\n"; } ' \ > $dir/nonsilence_phones.txt || exit 1; @@ -97,10 +97,10 @@ if [ $stage -le 0 ]; then # Add to cmudict the silences, noises etc. 
 (echo '!SIL SIL'; echo '<SPOKEN_NOISE> SPN'; echo '<UNK> UNK'; echo '<NOISE> NSN'; ) | \
- cat - $dir/dict.cmu > $dir/lexicon2_raw.txt
+ cat - $dir/dict.cmu > $dir/lexicon2_raw.txt
 awk '{print $1}' $dir/lexicon2_raw.txt > $dir/orig_wordlist
 
- cat <<EOF >$dir/silence_phones.txt
+ cat <<EOF >$dir/silence_phones.txt
 SIL
 SPN
 NSN
@@ -137,7 +137,7 @@ for line in sys.stdin.readlines():
 for oov in oovs:
 print (oov)' $dir/orig_wordlist | sort -u > $dir/oovlist
-
+
 cat $dir/oovlist | \
 get_acronym_prons.pl $dir/lexicon2_raw.txt > $dir/dict.acronyms
 fi
diff --git a/egs/bn/s5/local/train_lm.sh b/egs/bn/s5/local/train_lm.sh
index 8f82fe397e0..6522619cf77 100755
--- a/egs/bn/s5/local/train_lm.sh
+++ b/egs/bn/s5/local/train_lm.sh
@@ -37,7 +37,7 @@ export PATH=$KALDI_ROOT/tools/pocolm/scripts:$PATH
 fi
 ) || exit 1;
 
-num_dev_sentences=5000
+num_dev_sentences=4500
 
 RANDOM=0
 
 if [ $stage -le 0 ]; then
@@ -48,10 +48,15 @@ if [ $stage -le 0 ]; then
 
 rm ${dir}/data/text/* 2>/dev/null || true
 
- cat data/train_bn96/text | shuf > ${dir}/train_bn96_text
- head -n $num_dev_sentences < ${dir}/train_bn96_text | cut -d ' ' -f 2- > \
- ${dir}/data/text/dev.txt
- tail -n +$[num_dev_sentences+1] < ${dir}/train_text | cut -d ' ' -f 2- > \
- ${dir}/data/text/train_bn96.txt
+ # Take unique subset to make sure that the training text is not in the
+ # dev set.
+ cat data/train_bn96/text | cut -d ' ' -f 2- | sort | uniq -c | \
+ shuf > ${dir}/train_bn96_text
+ head -n $num_dev_sentences < ${dir}/train_bn96_text | \
+ awk '{str=$2; for (i=3;i<=NF;i++) {str = str" "$i;}; for (i=0; i<$1; i++) {print str;} }' > \
+ ${dir}/data/text/dev.txt
+ tail -n +$[num_dev_sentences+1] < ${dir}/train_bn96_text | \
+ awk '{str=$2; for (i=3;i<=NF;i++) {str = str" "$i;}; for (i=0; i<$1; i++) {print str;} }' > \
+ ${dir}/data/text/train_bn96.txt
 
 # Get text from NA News corpus
 for x in data/local/data/na_news/*; do
@@ -107,7 +112,7 @@ if [ $stage -le 2 ]; then
 cat $dir/data/work/word_counts/{train_bn96,dev}.counts | \
 local/lm/merge_word_counts.py 2 > $dir/data/work/train_bn96.wordlist_counts
 
- cat $dir/data/work/word_counts/csr96_hub4_{tr,ts}.counts | \
+ cat $dir/data/work/word_counts/csr96_hub4.counts | \
 local/lm/merge_word_counts.py 5 > $dir/data/work/csr96_hub4.wordlist_counts
 
 cat $dir/data/work/word_counts/csr95_hub4.counts | \
@@ -157,16 +162,33 @@ if [ $stage -le 3 ]; then
 ${dir}/data/text ${order} ${lm_dir}/work ${unpruned_lm_dir}
 
 for x in dev96ue dev96pe eval96 eval97 eval98 eval99_1 eval99_2; do
- $cmd ${unpruned_lm_dir}/log/compute_data_prob_${x}_set.log \
- get_data_prob.py ${dir}/data/${x}_set.txt ${unpruned_lm_dir}
+ $cmd ${unpruned_lm_dir}/log/compute_data_prob_${x}.log \
+ get_data_prob.py ${dir}/data/${x}.txt ${unpruned_lm_dir}
 
- cat ${unpruned_lm_dir}/log/compute_data_prob_${x}_set.log | grep -F '[perplexity'
+ cat ${unpruned_lm_dir}/log/compute_data_prob_${x}.log | grep -F '[perplexity'
 done
 
 # train_lm.py: Ngram counts: 190742 + 31139856 + 14766071 + 13851899 = 59948568
 # train_lm.py: You can set --bypass-metaparameter-optimization='1.000,0.007,0.000,0.002,0.000,0.006,0.003,0.000,0.000,0.000,0.001,0.002,0.002,0.000,0.000,0.000,0.003,0.000,0.000,0.604,0.187,0.044,0.012,1.000,0.490,0.026,0.001,0.732,0.328,0.281,0.218' to get equivalent results
 # get_data_prob.py: log-prob of data/local/local_lm_bn_nanews_csr96/data/real_dev_set.txt given model data/local/local_lm_bn_nanews_csr96/data/wordlist_4_default-5_bn-1.pocolm was -4.9927348506 per word [perplexity = 147.338822662] over 33180.0 words.
+ + # get_data_prob.py: log-prob of data/local/local_lm/data/dev96pe.txt given model data/local/local_lm/data/wordlist_4_default-5_train_bn96-1_csr96_hub4-2.3_csr95_hub4-2.3.pocolm was -4.92985727862 per word [perplexity = 138.359764034] over 23760.0 words. + # get_data_prob.py: log-prob of data/local/local_lm/data/dev96ue.txt given model data/local/local_lm/data/wordlist_4_default-5_train_bn96-1_csr96_hub4-2.3_csr95_hub4-2.3.pocolm was -4.88171588624 per word [perplexity = 131.85672102] over 18821.0 words. + # get_data_prob.py: log-prob of data/local/local_lm/data/eval96.txt given model data/local/local_lm/data/wordlist_4_default-5_train_bn96-1_csr96_hub4-2.3_csr95_hub4-2.3.pocolm was -4.85089075845 per word [perplexity = 127.85422637] over 20625.0 words. + # get_data_prob.py: log-prob of data/local/local_lm/data/eval97.txt given model data/local/local_lm/data/wordlist_4_default-5_train_bn96-1_csr96_hub4-2.3_csr95_hub4-2.3.pocolm was -4.84370861758 per word [perplexity = 126.939248987] over 33340.0 words. + # get_data_prob.py: log-prob of data/local/local_lm/data/eval98.txt given model data/local/local_lm/data/wordlist_4_default-5_train_bn96-1_csr96_hub4-2.3_csr95_hub4-2.3.pocolm was -4.91000862327 per word [perplexity = 135.640584068] over 33180.0 words. + # get_data_prob.py: log-prob of data/local/local_lm/data/eval99_1.txt given model data/local/local_lm/data/wordlist_4_default-5_train_bn96-1_csr96_hub4-2.3_csr95_hub4-2.3.pocolm was -5.03738768271 per word [perplexity = 154.067016944] over 11529.0 words. + # get_data_prob.py: log-prob of data/local/local_lm/data/eval99_2.txt given model data/local/local_lm/data/wordlist_4_default-5_train_bn96-1_csr96_hub4-2.3_csr95_hub4-2.3.pocolm was -5.02574438024 per word [perplexity = 152.283570813] over 16395.0 words. + fi + for x in dev96ue dev96pe eval96 eval97 eval98 eval99_1 eval99_2; do + $cmd ${unpruned_lm_dir}/log/compute_data_prob_${x}.log \ + get_data_prob.py ${dir}/data/${x}.txt ${unpruned_lm_dir} + + cat ${unpruned_lm_dir}/log/compute_data_prob_${x}.log | grep -F '[perplexity' + done + + if [ $stage -le 4 ]; then echo "$0: pruning the LM (to larger size)" @@ -177,10 +199,10 @@ if [ $stage -le 4 ]; then ${unpruned_lm_dir} ${dir}/data/lm_${order}_prune_big for x in dev96ue dev96pe eval96 eval97 eval98 eval99_1 eval99_2; do - $cmd ${dir}/data/lm_${order}_prune_big/log/compute_data_prob_${x}_set.log \ - get_data_prob.py ${dir}/data/${x}_set.txt ${dir}/data/lm_${order}_prune_big + $cmd ${dir}/data/lm_${order}_prune_big/log/compute_data_prob_${x}.log \ + get_data_prob.py ${dir}/data/${x}.txt ${dir}/data/lm_${order}_prune_big - cat ${dir}/data/lm_${order}_prune_big/log/compute_data_prob_${x}_set.log | grep -F '[perplexity' + cat ${dir}/data/lm_${order}_prune_big/log/compute_data_prob_${x}.log | grep -F '[perplexity' done # get_data_prob.py data/local/local_lm_bn_nanews_csr96/data/real_dev_set.txt data/local/local_lm_bn_nanews_csr96/data/lm_4_prune_big @@ -188,6 +210,14 @@ if [ $stage -le 4 ]; then # get_data_prob.py: log-prob of data/local/local_lm_bn_nanews_csr96/data/real_dev_set.txt given model data/local/local_lm_bn_nanews_csr96/data/lm_4_prune_big was -5.05700399638 per word [perplexity = 157.11908113] # over 33180.0 words. + # get_data_prob.py: log-prob of data/local/local_lm/data/dev96pe.txt given model data/local/local_lm/data/lm_4_prune_big was -5.00197658249 per word [perplexity = 148.706800062] over 23760.0 words. 
+ # get_data_prob.py: log-prob of data/local/local_lm/data/dev96ue.txt given model data/local/local_lm/data/lm_4_prune_big was -4.95522131024 per word [perplexity = 141.914009921] over 18821.0 words. + # get_data_prob.py: log-prob of data/local/local_lm/data/eval96.txt given model data/local/local_lm/data/lm_4_prune_big was -4.91668501333 per word [perplexity = 136.54920329] over 20625.0 words. + # get_data_prob.py: log-prob of data/local/local_lm/data/eval97.txt given model data/local/local_lm/data/lm_4_prune_big was -4.92810468806 per word [perplexity = 138.117488385] over 33340.0 words. + # get_data_prob.py: log-prob of data/local/local_lm/data/eval98.txt given model data/local/local_lm/data/lm_4_prune_big was -4.98326999699 per word [perplexity = 145.950861062] over 33180.0 words. + # get_data_prob.py: log-prob of data/local/local_lm/data/eval99_1.txt given model data/local/local_lm/data/lm_4_prune_big was -5.10923357186 per word [perplexity = 165.543429098] over 11529.0 words. + # get_data_prob.py: log-prob of data/local/local_lm/data/eval99_2.txt given model data/local/local_lm/data/lm_4_prune_big was -5.10475193474 per word [perplexity = 164.803183515] over 16395.0 words. + mkdir -p ${dir}/data/arpa format_arpa_lm.py ${dir}/data/lm_${order}_prune_big | gzip -c > ${dir}/data/arpa/${order}gram_big.arpa.gz fi @@ -203,10 +233,10 @@ if [ $stage -le 5 ]; then ${dir}/data/lm_${order}_prune_small for x in dev96ue dev96pe eval96 eval97 eval98 eval99_1 eval99_2; do - $cmd ${dir}/data/lm_${order}_prune_small/log/compute_data_prob_${x}_set.log \ - get_data_prob.py ${dir}/data/${x}_set.txt ${dir}/data/lm_${order}_prune_big + $cmd ${dir}/data/lm_${order}_prune_small/log/compute_data_prob_${x}.log \ + get_data_prob.py ${dir}/data/${x}.txt ${dir}/data/lm_${order}_prune_small - cat ${dir}/data/lm_${order}_prune_small/log/compute_data_prob_${x}_set.log | grep -F '[perplexity' + cat ${dir}/data/lm_${order}_prune_small/log/compute_data_prob_${x}.log | grep -F '[perplexity' done # get_data_prob.py data/local/local_lm_bn_nanews_csr96/data/real_dev_set.txt data/local/local_lm_bn_nanews_csr96/data/lm_4_prune_small @@ -214,6 +244,14 @@ if [ $stage -le 5 ]; then # get_data_prob.py: log-prob of data/local/local_lm_bn_nanews_csr96/data/real_dev_set.txt given model data/local/local_lm_bn_nanews_csr96/data/lm_4_prune_small was -5.27172473478 per word [perplexity = 194.751567749] over 33180.0 words. # float-counts-to-pre-arpa: output [ 190743 673670 802551 351512 ] n-grams + # get_data_prob.py: log-prob of data/local/local_lm/data/dev96pe.txt given model data/local/local_lm/data/lm_4_prune_small was -5.15402161616 per word [perplexity = 173.126339858] over 23760.0 words. + # get_data_prob.py: log-prob of data/local/local_lm/data/dev96ue.txt given model data/local/local_lm/data/lm_4_prune_small was -5.10689797354 per word [perplexity = 165.157237313] over 18821.0 words. + # get_data_prob.py: log-prob of data/local/local_lm/data/eval96.txt given model data/local/local_lm/data/lm_4_prune_small was -5.07740442667 per word [perplexity = 160.357296176] over 20625.0 words. + # get_data_prob.py: log-prob of data/local/local_lm/data/eval97.txt given model data/local/local_lm/data/lm_4_prune_small was -5.09747614277 per word [perplexity = 163.608461382] over 33340.0 words. + # get_data_prob.py: log-prob of data/local/local_lm/data/eval98.txt given model data/local/local_lm/data/lm_4_prune_small was -5.13563068716 per word [perplexity = 169.971484911] over 33180.0 words. 
+ # get_data_prob.py: log-prob of data/local/local_lm/data/eval99_1.txt given model data/local/local_lm/data/lm_4_prune_small was -5.26596417642 per word [perplexity = 193.632915104] over 11529.0 words.
+ # get_data_prob.py: log-prob of data/local/local_lm/data/eval99_2.txt given model data/local/local_lm/data/lm_4_prune_small was -5.26092885453 per word [perplexity = 192.660361662] over 16395.0 words.
+
 format_arpa_lm.py ${dir}/data/lm_${order}_prune_small | gzip -c > ${dir}/data/arpa/${order}gram_small.arpa.gz
 fi
 
From 6f316ef4a1c5d3aab6d4cdce238fa869b9f58b6a Mon Sep 17 00:00:00 2001
From: Vimal Manohar
Date: Thu, 12 Jan 2017 15:11:23 -0500
Subject: [PATCH 11/38] bn: Updating main recipe

---
 egs/bn/s5/run.sh | 91 +++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 75 insertions(+), 16 deletions(-)

diff --git a/egs/bn/s5/run.sh b/egs/bn/s5/run.sh
index 24c47cb90ba..6ed0a913b01 100755
--- a/egs/bn/s5/run.sh
+++ b/egs/bn/s5/run.sh
@@ -13,18 +13,46 @@ set -o pipefail
 mfccdir=`pwd`/mfcc
 nj=40
 
-local/data_prep/prepare_bn_data.py --split-at-sync=false \
- /export/corpora5/LDC/LDC97S44 \
- /export/corpora/LDC/LDC97T22 data/local/data/train
+false && {
+
+# Prepare 1996 English Broadcast News Train (HUB4)
+local/data_prep/prepare_1996_bn_data.py --noise-word="<NOISE>" \
+ --spoken-noise-word="<SPOKEN_NOISE>" \
+ /export/corpora/LDC/LDC97S44 /export/corpora/LDC/LDC97T22 \
+ data/local/data/train_bn96
+
+# Prepare 1995 CSR-IV HUB4 corpus
+local/data_prep/prepare_1995_csr_hub4_corpus.sh \
+ /export/corpora5/LDC/LDC96S31/csr95_hub4/ data/local/data/csr95_hub4
 
-local/data_prep/prepare_na_news_test_corpus.sh --nj 40 --cmd "$train_cmd" \
+# Prepare North American News Text Corpus
+local/data_prep/prepare_na_news_text_corpus.sh --nj 40 --cmd "$train_cmd" \
 /export/corpora/LDC/LDC95T21 data/local/data/na_news
 
-local/data_prep/prepare_1996_csr_hub4_corpus.sh --nj 10 --cmd "$train_cmd" \
- /export/corpora/LDC/LDC98T31 data/local/data/csr96_hub4
+# Prepare North American News Text Supplement Corpus
+local/data_prep/prepare_na_news_text_supplement.sh --nj 10 --cmd "$train_cmd" \
+ /export/corpora/LDC/LDC98T30/northam_news_txt_sup data/local/data/na_news_supp
+
+# Prepare 1996 CSR HUB4 Language Model
+local/data_prep/prepare_1996_csr_hub4_lm_corpus.sh --nj 10 --cmd "$train_cmd" \
+ /export/corpora/LDC/LDC98T31/1996_csr_hub4_model data/local/data/csr96_hub4
+
+# Prepare 1996 English Broadcast News Dev and Eval (HUB4)
+local/data_prep/prepare_1996_hub4_bn_eng_dev_and_eval.sh \
+ /export/corpora/LDC/LDC97S66/1996_eng_bcast_dev_eval \
+ data/local/data/hub4_96_dev_eval
 
-local/prepare_1998_hub4_bn_eng_eval.sh /export/corpora/LDC/LDC2000S86/ \
- data/local/data/eval98
+# Prepare 1997 HUB4 English Evaluation corpus
+local/data_prep/prepare_1997_hub4_bn_eng_eval.sh \
+ /export/corpora/LDC/LDC2002S11/hub4e_97 data/local/data/eval97
+
+# Prepare 1998 HUB4 Broadcast News Evaluation English Test Material
+local/data_prep/prepare_1998_hub4_bn_eng_eval.sh \
+ /export/corpora/LDC/LDC2000S86/ data/local/data/eval98
+
+# Prepare 1999 HUB4 Broadcast News Evaluation English Test Material
+local/data_prep/prepare_1999_hub4_bn_eng_eval.sh \
+ /export/corpora5/LDC/LDC2000S88/hub4_1999 data/local/data/eval99
 
 local/format_data.sh
 
@@ -36,9 +64,9 @@ local/prepare_dict.sh --dict-suffix "_nosp" \
 utils/prepare_lang.sh data/local/dict_nosp \
 "<UNK>" data/local/lang_tmp_nosp data/lang_nosp
 
-local/format_lms.sh
+local/format_lms.sh --local-lm-dir data/local/local_lm
 
-for x in train eval98 eval98.pem; do
+for x in train dev96ue dev96pe eval96 eval96.pem eval97 eval97.pem eval98 eval98.pem eval99_1 eval99_1.pem eval99_2 eval99_2.pem; do
 this_nj=$(cat data/$x/utt2spk | wc -l)
 if [ $this_nj -gt 30 ]; then
 this_nj=30
@@ -50,6 +78,7 @@ for x in train eval98 eval98.pem; do
 steps/compute_cmvn_stats.sh data/$x exp/make_mfcc $mfccdir
 utils/fix_data_dir.sh data/$x
 done
+}
 
 utils/subset_data_dir.sh --shortest data/train 1000 data/train_1kshort
 utils/subset_data_dir.sh data/train 2000 data/train_2k
@@ -80,11 +109,41 @@ steps/train_sat.sh --cmd "$train_cmd" 4200 40000 \
 
 utils/mkgraph.sh data/lang_nosp_test exp/tri3 exp/tri3/graph_nosp
 
-steps/decode_fmllr.sh --nj 20 --cmd "$decode_cmd" \
- exp/tri3/graph_nosp data/eval98.pem exp/tri3/decode_nosp_eval98.pem
-steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
- data/lang_nosp_test data/lang_nosp_test_rescore \
- data/eval98.pem exp/tri3/decode_nosp_eval98.pem \
- exp/tri3/decode_rescore_nosp_eval98.pem
+(
+for dset in eval96.pem eval97.pem; do
+ this_nj=`cat data/$dset/spk2utt | wc -l`
+ if [ $this_nj -gt 20 ]; then
+ this_nj=20
+ fi
+ steps/decode_fmllr.sh --nj $this_nj --cmd "$decode_cmd" --num-threads 4 \
+ exp/tri3/graph_nosp data/$dset exp/tri3/decode_nosp_${dset}
+ steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+ data/lang_nosp_test data/lang_nosp_test_rescore \
+ data/${dset} exp/tri3/decode_nosp_${dset} \
+ exp/tri3/decode_nosp_${dset}_rescore
+done
+) &
+
+steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \
+ data/train data/lang_nosp exp/tri3 exp/tri3_ali
+
+steps/train_sat.sh --cmd "$train_cmd" 5000 100000 \
+ data/train data/lang_nosp exp/tri3_ali exp/tri4
+
+utils/mkgraph.sh data/lang_nosp_test exp/tri4 exp/tri4/graph_nosp
+
+for dset in eval96.pem eval97.pem; do
+ this_nj=`cat data/$dset/spk2utt | wc -l`
+ if [ $this_nj -gt 20 ]; then
+ this_nj=20
+ fi
+ steps/decode_fmllr.sh --nj $this_nj --cmd "$decode_cmd" --num-threads 4 \
+ exp/tri4/graph_nosp data/$dset exp/tri4/decode_nosp_${dset}
+ steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+ data/lang_nosp_test data/lang_nosp_test_rescore \
+ data/${dset} exp/tri4/decode_nosp_${dset} \
+ exp/tri4/decode_nosp_${dset}_rescore
+done
+wait
 
 exit 0
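A recurring idiom in the run.sh above: the number of decoding jobs is capped at the number of speakers, because the decode scripts split the data by speaker and an empty split would fail. In Python form, as a small sketch:

    def decode_jobs(spk2utt_path, max_jobs=20):
        # one line per speaker in spk2utt; never use more jobs than that
        with open(spk2utt_path) as f:
            num_speakers = sum(1 for line in f if line.strip())
        return min(num_speakers, max_jobs)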
From cc9752c115d9cecc3d2dff254f9cd9dcf333c769 Mon Sep 17 00:00:00 2001
From: Vimal Manohar
Date: Thu, 23 Mar 2017 17:37:57 -0400
Subject: [PATCH 12/38] bn: Minor fixes in BN recipe

---
 egs/bn/s5/local/run_cleanup_segmentation.sh | 29 ++++++++++++++-------
 egs/bn/s5/local/score_sclite.sh | 26 +++++++++++++++---
 egs/bn/s5/path.sh | 6 +++--
 egs/bn/s5/run.sh | 7 +++--
 4 files changed, 48 insertions(+), 20 deletions(-)

diff --git a/egs/bn/s5/local/run_cleanup_segmentation.sh b/egs/bn/s5/local/run_cleanup_segmentation.sh
index 0927b9f9a7d..2a56884446c 100755
--- a/egs/bn/s5/local/run_cleanup_segmentation.sh
+++ b/egs/bn/s5/local/run_cleanup_segmentation.sh
@@ -16,10 +16,6 @@
 # GMM Results for speaker-independent (SI) and speaker adaptive training (SAT) systems on dev and test sets
 # [will add these later].
 
-set -e
-set -o pipefail
-set -u
-
 stage=0
 cleanup_stage=0
 data=data/train
@@ -31,6 +27,11 @@ decode_num_threads=4
 
 . ./path.sh
 . ./cmd.sh
+
+set -e
+set -o pipefail
+set -u
+
 . utils/parse_options.sh
 
 cleaned_data=${data}_${cleanup_affix}
@@ -55,12 +56,16 @@ if [ $stage -le 3 ]; then
 fi
 
 if [ $stage -le 4 ]; then
- # Test with the models trained on cleaned-up data.
+ # Test with the model trained on cleaned-up data.
utils/mkgraph.sh data/lang_nosp_test ${cleaned_dir} ${cleaned_dir}/graph_nosp - for dset in eval98.pem; do + for dset in eval97.pem eval98.pem eval99_1.pem eval99_2.pem; do + this_nj=`cat data/$dset/spk2utt | wc -l` + if [ $this_nj -gt $decode_nj ]; then + this_nj=$decode_nj + fi steps/decode_fmllr.sh --nj $decode_nj --num-threads $decode_num_threads \ - --cmd "$decode_cmd" --num-threads 4 \ + --cmd "$decode_cmd" \ ${cleaned_dir}/graph_nosp data/${dset} ${cleaned_dir}/decode_nosp_${dset} steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang_nosp_test data/lang_nosp_test_rescore \ data/${dset} ${cleaned_dir}/decode_nosp_${dset} ${cleaned_dir}/decode_nosp_${dset}_rescore @@ -80,12 +85,16 @@ fi cleaned_dir=exp/tri4b_${cleanup_affix} if [ $stage -le 7 ]; then - # Test with the models trained on cleaned-up data. + # Test with the larger model trained on cleaned-up data. utils/mkgraph.sh data/lang_nosp_test ${cleaned_dir} ${cleaned_dir}/graph_nosp - for dset in eval98.pem; do + for dset in eval97.pem eval98.pem eval99_1.pem eval99_2.pem; do + this_nj=`cat data/$dset/spk2utt | wc -l` + if [ $this_nj -gt $decode_nj ]; then + this_nj=$decode_nj + fi steps/decode_fmllr.sh --nj $decode_nj --num-threads $decode_num_threads \ - --cmd "$decode_cmd" --num-threads 4 \ + --cmd "$decode_cmd" \ ${cleaned_dir}/graph_nosp data/${dset} ${cleaned_dir}/decode_nosp_${dset} steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang_nosp_test data/lang_nosp_test_rescore \ data/${dset} ${cleaned_dir}/decode_nosp_${dset} ${cleaned_dir}/decode_nosp_${dset}_rescore diff --git a/egs/bn/s5/local/score_sclite.sh b/egs/bn/s5/local/score_sclite.sh index 20045c2e96b..ae372b21f04 100755 --- a/egs/bn/s5/local/score_sclite.sh +++ b/egs/bn/s5/local/score_sclite.sh @@ -8,6 +8,7 @@ min_lmwt=5 max_lmwt=17 iter=final word_ins_penalty=0.0,0.5,1.0 +resolve_ctm_overlaps=false #end configuration section. [ -f ./path.sh ] && . ./path.sh @@ -60,13 +61,30 @@ if [ $stage -le 0 ]; then lattice-1best ark:- ark:- \| \ lattice-align-words $lang/phones/word_boundary.int $model ark:- ark:- \| \ nbest-to-ctm $frame_shift_opt ark:- - \| \ - utils/int2sym.pl -f 5 $lang/words.txt \| \ - utils/convert_ctm.pl $data/segments $data/reco2file_and_channel \ - '>' $dir/score_LMWT_${wip}/$name.ctm || exit 1; + utils/int2sym.pl -f 5 $lang/words.txt '>' \ + $dir/score_LMWT_${wip}/$name.utt_ctm || exit 1; done fi +utils/data/get_reco2utt.sh $data if [ $stage -le 1 ]; then + for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do + if $resolve_ctm_overlaps; then + $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/resolve_ctm_overlaps.LMWT.${wip}.log \ + steps/resolve_ctm_overlaps.py $data/segments $data/reco2utt \ + $dir/score_LMWT_${wip}/$name.utt_ctm - \| \ + utils/convert_ctm.pl $data/segments $data/reco2file_and_channel \ + '>' $dir/score_LMWT_${wip}/$name.ctm || exit 1; + else + $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/convert_ctm.LMWT.${wip}.log \ + cat $dir/score_LMWT_${wip}/$name.utt_ctm \| \ + utils/convert_ctm.pl $data/segments $data/reco2file_and_channel \ + '>' $dir/score_LMWT_${wip}/$name.ctm || exit 1; + fi + done +fi + +if [ $stage -le 2 ]; then # Remove some stuff we don't want to score, from the ctm. # the big expression in parentheses contains all the things that get mapped # by the glm file, into hesitations. @@ -83,7 +101,7 @@ if [ $stage -le 1 ]; then fi # Score the set... 
-if [ $stage -le 2 ]; then
+if [ $stage -le 3 ]; then
 for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do
 $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.${wip}.log \
 cp $data/stm $dir/score_LMWT_${wip}/ '&&' \
diff --git a/egs/bn/s5/path.sh b/egs/bn/s5/path.sh
index da29adb7b2a..dc878dc9c45 100755
--- a/egs/bn/s5/path.sh
+++ b/egs/bn/s5/path.sh
@@ -1,6 +1,8 @@
-export KALDI_ROOT=`pwd`/../../..
+export KALDI_ROOT=/home/vmanoha1/kaldi-diarization-v2
 export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/sph2pipe_v2.5:$PWD:$PATH
 [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1
 . $KALDI_ROOT/tools/config/common_path.sh
-. $KALDI_ROOT/tools/env.sh
+export PATH=/home/vmanoha1/kaldi-diarization-v2/src/ivectorbin/:$PATH
+export PATH=/home/vmanoha1/kaldi-diarization-v2/src/segmenterbin/:$PATH
+export PATH=$KALDI_ROOT/tools/sctk/bin:$PATH
 export LC_ALL=C
diff --git a/egs/bn/s5/run.sh b/egs/bn/s5/run.sh
index 6ed0a913b01..c151c693493 100755
--- a/egs/bn/s5/run.sh
+++ b/egs/bn/s5/run.sh
@@ -14,7 +14,6 @@ mfccdir=`pwd`/mfcc
 nj=40
 
 false && {
-
 # Prepare 1996 English Broadcast News Train (HUB4)
 local/data_prep/prepare_1996_bn_data.py --noise-word="<NOISE>" \
 --spoken-noise-word="<SPOKEN_NOISE>" \
@@ -78,7 +77,6 @@ for x in train dev96ue dev96pe eval96 eval96.pem eval97 eval97.pem eval98 eval98
 steps/compute_cmvn_stats.sh data/$x exp/make_mfcc $mfccdir
 utils/fix_data_dir.sh data/$x
 done
-}
 
 utils/subset_data_dir.sh --shortest data/train 1000 data/train_1kshort
 utils/subset_data_dir.sh data/train 2000 data/train_2k
@@ -110,7 +108,7 @@ steps/train_sat.sh --cmd "$train_cmd" 4200 40000 \
 utils/mkgraph.sh data/lang_nosp_test exp/tri3 exp/tri3/graph_nosp
 
 (
-for dset in eval96.pem eval97.pem; do
+for dset in eval97.pem; do
 this_nj=`cat data/$dset/spk2utt | wc -l`
 if [ $this_nj -gt 20 ]; then
 this_nj=20
@@ -131,8 +129,9 @@ steps/train_sat.sh --cmd "$train_cmd" 5000 100000 \
 data/train data/lang_nosp exp/tri3_ali exp/tri4
 
 utils/mkgraph.sh data/lang_nosp_test exp/tri4 exp/tri4/graph_nosp
+}
 
-for dset in eval96.pem eval97.pem; do
+for dset in eval97.pem; do
 this_nj=`cat data/$dset/spk2utt | wc -l`
 if [ $this_nj -gt 20 ]; then
 this_nj=20
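Patch 12's score_sclite.sh changes (above) optionally route the per-utterance CTM through steps/resolve_ctm_overlaps.py before converting it to recording-level CTM, so that words hypothesized twice in the overlapped regions of adjacent segments are not scored twice. That script is not shown here; a much-simplified sketch of one usual strategy — cut at the midpoint of the overlap and keep each word on the side where its own midpoint falls — would be:

    def resolve_overlap(ctm_a, ctm_b, overlap_start, overlap_end):
        # ctm_* are lists of (start_time, duration, word) for two
        # overlapping segments; a simplified illustration only, not
        # the actual script's logic
        cut = (overlap_start + overlap_end) / 2.0
        keep_a = [e for e in ctm_a if e[0] + e[1] / 2.0 <= cut]
        keep_b = [e for e in ctm_b if e[0] + e[1] / 2.0 > cut]
        return keep_a + keep_b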
From 634d030bd88ff1533ef1a052c6b1e608cb11dadc Mon Sep 17 00:00:00 2001
From: Vimal Manohar
Date: Fri, 3 Nov 2017 12:49:07 -0400
Subject: [PATCH 13/38] HUB4 train preparation scripts

---
 .../s5/local/data_prep/format_1996_bn_data.pl | 131 +++++++++
 .../s5/local/data_prep/format_1997_bn_data.pl | 1 +
 .../data_prep/normalize_bn_transcript.py | 43 +++
 egs/bn/s5/local/data_prep/parse_sgm.pl | 275 ++++++++++++++++++
 .../local/data_prep/prepare_1996_bn_data.sh | 44 +++
 .../local/data_prep/prepare_1997_bn_data.sh | 44 +++
 .../prepare_na_news_text_supplement.sh | 0
 7 files changed, 538 insertions(+)
 create mode 100755 egs/bn/s5/local/data_prep/format_1996_bn_data.pl
 create mode 120000 egs/bn/s5/local/data_prep/format_1997_bn_data.pl
 create mode 100755 egs/bn/s5/local/data_prep/normalize_bn_transcript.py
 create mode 100755 egs/bn/s5/local/data_prep/parse_sgm.pl
 create mode 100755 egs/bn/s5/local/data_prep/prepare_1996_bn_data.sh
 create mode 100755 egs/bn/s5/local/data_prep/prepare_1997_bn_data.sh
 mode change 100644 => 100755 egs/bn/s5/local/data_prep/prepare_na_news_text_supplement.sh

diff --git a/egs/bn/s5/local/data_prep/format_1996_bn_data.pl b/egs/bn/s5/local/data_prep/format_1996_bn_data.pl
new file mode 100755
index 00000000000..84913e9a8b0
--- /dev/null
+++ b/egs/bn/s5/local/data_prep/format_1996_bn_data.pl
@@ -0,0 +1,131 @@
+#!/usr/bin/env perl
+#===============================================================================
+# Copyright (c) 2017 Johns Hopkins University
+# (Author: Jan "Yenda" Trmal )
+# 2017 Vimal Manohar
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+#===============================================================================
+
+use strict;
+use warnings;
+use utf8;
+use List::Util qw(max);
+
+my $audio_width=1;
+my $speaker_width=1;
+my $time_width=1;
+
+binmode(STDOUT, ":utf8");
+binmode(STDERR, ":utf8");
+
+if (@ARGV != 3) {
+ print STDERR "$0: Error: Unsupported number of arguments: " . scalar @ARGV ."\n";
+ print STDERR " Usage: $0 <audio-files-list> <transcripts-file> <out-dir>\n";
+ print STDERR " where\n";
+ print STDERR " <audio-files-list> is a file containing list of audio files\n";
+ print STDERR " (single absolute path name per line)\n";
+ print STDERR " <transcripts-file> is a file containing transcripts\n";
+ print STDERR " obtained by processing the official SGML format\n";
+ print STDERR " transcripts. See parse_sgm.pl for further info.\n";
+ print STDERR " <out-dir> target directory (should already exist)\n";
+ print STDERR " See also: local/parse_sgm.pl\n";
+ die;
+}
+
+my $audio_files = $ARGV[0];
+my $transcripts = $ARGV[1];
+my $out = $ARGV[2];
+
+my %AUDIO;
+open(my $audio_f, "<", $audio_files)
+ or die "$0: Error: Could not open $audio_files: $!\n";
+while(my $line = <$audio_f>) {
+ chomp $line;
+ (my $basename = $line) =~ s/.*\/([^\/]+).sph/$1/g;
+ $basename =~ s/_$//g;
+ $AUDIO{$basename} = $line;
+}
+close($audio_f);
+
+my %TRANSCRIPT;
+open(my $transcript_f, "<:encoding(utf-8)", $transcripts)
+ or die "$0: Error: Could not open $transcripts: $!\n";
+while(my $line = <$transcript_f>) {
+ chomp $line;
+ my @F = split / /, $line, 8;
+ push @{$TRANSCRIPT{$F[0]}}, \@F;
+
+ my $f1 = $F[0];
+ my $f2 = $F[1];
+ my $speaker = $F[2];
+ my $t1 = $F[5];
+ my $t2 = $F[6];
+
+ $time_width = max $time_width, length($t1), length($t2);
+ $speaker_width = max $speaker_width, length($speaker);
+ $audio_width = max $audio_width, length($f1);
+}
+close($transcript_f);
+#print Dumper(\%TRANSCRIPT);
+
+print STDERR $time_width . " " . $speaker_width . " " . $audio_width . "\n";
+
+my $sph2pipe = `which sph2pipe` or do {
+ die "$0: Error: sph2pipe is not installed. Did you run make in the tools/ directory?\n";
+};
+chomp $sph2pipe;
+
+open(my $wav_file, ">", "$out/wav.scp")
+ or die "$0: Error: Cannot create file $out/wav.scp: $!\n";
+open(my $text_file, ">:encoding(utf-8)", "$out/text")
+ or die "$0: Error: Cannot create file $out/text: $!\n";
+open(my $segments_file, ">", "$out/segments")
+ or die "$0: Error: Cannot create file $out/segments: $!\n";
+open(my $spk_file, ">", "$out/utt2spk")
+ or die "$0: Error: Cannot create file $out/utt2spk: $!\n";
+
+foreach my $file (sort keys %AUDIO) {
+ unless (exists $TRANSCRIPT{$file}) {
+ print STDERR "$0: Error: $file does not exist in transcripts!\n";
+ next;
+ }
+ my $transcripts = $TRANSCRIPT{$file};
+
+ #my $file_fmt = sprintf("%0${audio_width}s", $file);
+ my $file_fmt = sprintf("%s", $file);
+
+ print $wav_file "$file_fmt $sph2pipe -f wav $AUDIO{$file}|\n";
+
+ foreach my $utt (@{$transcripts}) {
+ my $start = $utt->[5] + 0.0;
+ my $end = $utt->[6] + 0.0;
+ if ($end - $start < 0.005) { # remove very short segments
+ next;
+ }
+ my $start_time = sprintf("%0${time_width}d", $utt->[5]*1000);
+ my $end_time = sprintf("%0${time_width}d", $utt->[6]*1000);
+ my $spk = sprintf("%0${speaker_width}s", $utt->[2]);
+ # my $spk = sprintf("%s", $utt->[2]);
+ my $spkid = "${file_fmt}_${spk}";
+ my $uttid = "${file_fmt}_${spk}_${start_time}_${end_time}";
+
+ print $text_file "$uttid $utt->[7]\n";
+ print $spk_file "$uttid $spkid\n";
+ print $segments_file "$uttid $file_fmt $start $end\n";
+ }
+}
+
+close($wav_file);
+close($text_file);
+close($segments_file);
+close($spk_file);
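The sprintf calls in format_1996_bn_data.pl above zero-pad the millisecond start/end times (and the speaker label) to a fixed width before embedding them in the utterance id, so that the C-locale string sort used throughout Kaldi also orders utterances by time. The same construction in Python, as an illustration only (the width and separator are assumptions):

    def utterance_id(reco, spk, start, end, width=7):
        # times in zero-padded milliseconds, e.g. 12.34s -> '0012340'
        start_ms = '{0:0{1}d}'.format(int(start * 1000), width)
        end_ms = '{0:0{1}d}'.format(int(end * 1000), width)
        return '{0}_{1}_{2}_{3}'.format(reco, spk, start_ms, end_ms)

    # utterance_id('ed960528', 'spk1', 12.34, 15.6)
    # -> 'ed960528_spk1_0012340_0015600'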
diff --git a/egs/bn/s5/local/data_prep/format_1997_bn_data.pl b/egs/bn/s5/local/data_prep/format_1997_bn_data.pl
new file mode 120000
index 00000000000..844c16bbe06
--- /dev/null
+++ b/egs/bn/s5/local/data_prep/format_1997_bn_data.pl
@@ -0,0 +1 @@
+format_1996_bn_data.pl
\ No newline at end of file
diff --git a/egs/bn/s5/local/data_prep/normalize_bn_transcript.py b/egs/bn/s5/local/data_prep/normalize_bn_transcript.py
new file mode 100755
index 00000000000..1f7367438f4
--- /dev/null
+++ b/egs/bn/s5/local/data_prep/normalize_bn_transcript.py
@@ -0,0 +1,43 @@
+#!/usr/bin/env python
+
+import re, sys
+
+def main():
+ if len(sys.argv) != 3:
+ sys.stderr.write("{0} <noise-word> <spoken-noise-word> "
+ "< text_file > out_text_file\n".format(sys.argv[0]))
+ sys.exit(1)
+
+ noise_word = sys.argv[1]
+ spoken_noise_word = sys.argv[2]
+
+ for line in sys.stdin.readlines():
+ parts = line.strip().split()
+ normalized_text = normalize_bn_transcript(
+ ' '.join(parts[1:]), noise_word, spoken_noise_word)
+ print ("{0} {1}".format(parts[0], normalized_text))
+
+
+def normalize_bn_transcript(text, noise_word, spoken_noise_word):
+ """Normalize broadcast news transcript for audio."""
+ text = text.upper()
+ # Remove unclear speech markings
+ text = re.sub(r"\(\(([^)]*)\)\)", r"\1", text)
+ text = re.sub(r"#", "", text) # Remove overlapped speech markings
+ # Remove invented word markings
+ text = re.sub(r"\*\*([^*]+)\*\*", r"\1", text)
+ text = re.sub(r"\[[^]]+\]", noise_word, text)
+ text = re.sub(r"\{[^}]+\}", spoken_noise_word, text)
+ # Remove mispronunciation brackets
+ text = re.sub(r"\+([^+]+)\+", r"\1", text)
+
+ text1 = []
+ for word in text.split():
+ # Remove best guesses for proper nouns
+ word = re.sub(r"^@(\w+)$", r"\1", word)
+ text1.append(word)
+ return " ".join(text1)
+
+
+if __name__ == "__main__":
+ main()
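A quick usage example for normalize_bn_transcript() as just defined, assuming the markup follows the LDC HUB4 transcription conventions (((...)) unclear speech, **...** invented words, [...] noise, {...} spoken noise, +...+ mispronunciations, @ best-guess proper nouns):

    line = "((i think)) **gorbachev** [laughter] he's +really+ @Moscow bound"
    print(normalize_bn_transcript(line, "<NOISE>", "<SPOKEN_NOISE>"))
    # -> I THINK GORBACHEV <NOISE> HE'S REALLY MOSCOW BOUND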
@@ -0,0 +1,275 @@
+#!/usr/bin/env perl
+#===============================================================================
+# Copyright (c) 2017  Johns Hopkins University (Author: Jan "Yenda" Trmal <jtrmal@gmail.com>)
+#                     2017  Vimal Manohar
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+#===============================================================================
+
+use strict;
+use warnings;
+use utf8;
+
+# A bare "require ... or die" never reaches the die (require itself throws),
+# so probe for the CPAN module inside an eval.
+eval { require HTML::Parser; 1 } or die "This script needs HTML::Parser from CPAN\n";
+HTML::Parser->import();
+
+binmode(STDOUT, ":utf8");
+
+sub trim { my $s = shift; $s =~ s/^\s+|\s+$//g; return $s };
+
+sub parse_sgml_tag {
+  my $tag = shift(@_);
+  my %ret;
+
+  if ($tag !~ /=/) {
+    return %ret;
+  }
+
+  $tag =~ s/<[a-zA-Z]+ //;
+  $tag =~ s/> *$//;
+  #print $tag . "\n";
+
+  my @key_value_pairs = split / *,? +/, $tag;
+  for my $entry (@key_value_pairs) {
+    (my $key, my $value) = split '=', $entry, 2;
+    $ret{$key} = $value;
+  }
+  return %ret;
+}
+
+if (@ARGV != 1) {
+  print STDERR "$0: This script needs exactly one parameter (list of SGML files)\n";
+  print STDERR "  Usage: $0 <sgml-file-list>\n";
+  print STDERR "  where\n";
+  print STDERR "    <sgml-file-list> is a file containing the official SGML format\n";
+  print STDERR "    transcripts. The files are parsed and the parsed representation\n";
+  print STDERR "    is dumped to STDOUT, one utterance plus the additional data fields\n";
+  print STDERR "    per line (we dump all the fields, but not all fields are used\n";
+  print STDERR "    in the recipe).\n";
+  die;
+}
+my $filelist=$ARGV[0];
+
+my $p = HTML::Parser->new();
+
+my @files=();
+open(F, '<', $filelist) or die "Could not open file $filelist: $!\n";
+while(<F>) {
+  chomp;
+  push @files, $_;
+}
+
+foreach my $file (@files) {
+  my $reporter="";
+  my $start = -1;
+  my $end = -1;
+  my $segment_start = -1;
+  my $segment_end = -1;
+  my $segment_speaker;
+  my $segment_fidelity = "XXX";
+  my $segment_mode = "XXX";
+  my $section_start = -1;
+  my $section_end = -1;
+  my $filename = "";
+  my $seq = 0;
+  my @text = ();
+  my $time;
+  my @tagqueue;
+
+  my $sgml_file = `basename $file`;
+  $sgml_file = trim $sgml_file;
+  $sgml_file =~ s/\.txt$//g;
+  $sgml_file =~ s/\.sgml$//g;
+  $sgml_file =~ s/_$//g;
+
+  open(my $f, '<:encoding(iso-8859-1)', $file) or die "Could not open file $file: $!\n";
+
+  while(my $line = <$f>) {
+    chomp $line;
+    $line = trim $line;
+    $line = lc $line;
+    next unless $line;
+
+    if ($line =~ /<\/episode/) {
+      my $p = pop @tagqueue;
+      $line =~ s/<\/(.*)>/$1/g;
+      $line = trim $line;
+      die "Unaligned tags: '" . $p->[0] . "' vs '$line'" if $p->[0] ne $line;
+      #print "ES: $line\n";
+      ;
+    } elsif ($line =~ /<\/section/) {
+      my $p = pop @tagqueue;
+      $line =~ s/<\/(.*)>/$1/g;
+      $line = trim $line;
+      die "Unaligned tags: '" . $p->[0] . "' vs '$line'" if $p->[0] ne $line;
+      #print "ES: $line\n";
+      ;
+    } elsif ($line =~ /<\/turn/) {
+      my $p = pop @tagqueue;
+      $line =~ s/<\/(.*)>/$1/g;
+      $line = trim $line;
+      die "Unaligned tags: '" . $p->[0] . "' vs '$line'" if $p->[0] ne $line;
+
+      #print join(" ", @text) . "\n" if @text > 0;
+      my $new_time = $segment_end;
+      if (@text > 0) {
+        print "$sgml_file $filename $segment_speaker $segment_fidelity $segment_mode $time $new_time ";
+        print join(" ", @text) . "\n";
+      }
+      @text = ();
+      $time = 0;
+      $segment_speaker = "XXX";
+      $segment_start = "XXX";
+      $segment_end = "XXX";
+      $segment_fidelity = "XXX";
+      $segment_mode = "XXX";
+      #print "ET: $line\n";
+      ;
+    } elsif ($line =~ /<\/segment/) {
+      my $p = pop @tagqueue;
+      $line =~ s/<\/(.*)>/$1/g;
+      $line = trim $line;
+      die "Unaligned tags: '" . $p->[0] . "' vs '$line'" if $p->[0] ne $line;
+
+      #print join(" ", @text) . "\n" if @text > 0;
+      my $new_time = $segment_end;
+      if (@text > 0) {
+        print "$sgml_file $filename $segment_speaker $segment_fidelity $segment_mode $time $new_time ";
+        print join(" ", @text) . "\n";
+      }
+      @text = ();
+      $time = 0;
+      $segment_speaker = "XXX";
+      $segment_start = "XXX";
+      $segment_end = "XXX";
+      $segment_fidelity = "XXX";
+      $segment_mode = "XXX";
+      #print "ET: $line\n";
+      ;
+    } elsif ($line =~ /<sync/) {
+      # a sync tag marks a time boundary inside the current turn/segment;
+      # flush the words accumulated since the previous boundary
+      my %tags = parse_sgml_tag($line);
+      my $new_time = $tags{time};
+      if (@text > 0) {
+        print "$sgml_file $filename $segment_speaker $segment_fidelity $segment_mode $time $new_time ";
+        print join(" ", @text) . "\n";
+      }
+      @text = ();
+      $time = $new_time;
+      ;
+    } elsif ($line =~ /<\/sync/) {
+      #print $line;
+      ;
+    } elsif ($line =~ /