diff --git a/egs/wsj/s5/steps/cleanup/get_ctm_edits.py b/egs/wsj/s5/steps/cleanup/get_ctm_edits.py
deleted file mode 100755
index ee75cfb4dfe..00000000000
--- a/egs/wsj/s5/steps/cleanup/get_ctm_edits.py
+++ /dev/null
@@ -1,352 +0,0 @@
-#!/usr/bin/env python
-
-# Copyright 2016  Vimal Manohar
-#           2016  Johns Hopkins University (author: Daniel Povey)
-# Apache 2.0
-
-from __future__ import print_function
-import sys, operator, argparse
-
-# Modify the CTM to include for each token the information from Levenshtein
-# alignment of 'hypothesis' and 'reference'
-# (i.e. the output of 'align-text').
-
-# The information added to each token in the CTM is the reference word and one
-# of the following edit-types:
-#  'cor' = correct  [note: as a special case we count as correct cases where
-#          the hypothesis word is the OOV symbol and the reference
-#          word is OOV w.r.t. the supplied vocabulary.]
-#  'sub' = substitution
-#  'del' = deletion
-#  'ins' = insertion
-#  'sil' = (silence in ctm; does not consume a reference word)
-# note: the following extra edit-type may be added by modify_ctm_edits.py:
-#  'fix' ... this is like 'cor', but it means the reference has been modified
-#          to fix non-scoreable errors [typically errors that don't change the
-#          meaning], so we don't trust the word or value it as much as a 'cor'.
-
-# Note: Additional lines are added to the CTM to account for deletions.
-
-# Input CTM:
-# (note: the <eps> is for silence in the input CTM that comes from
-# optional-silence in the graph.  However, the input edits don't have anything
-# for these silences.
-# We assume (and check) that the channel will always be '1', because the
-# input CTMs are expected to be 'per utterance', not including real
-# recording-ids.
-
-# Input ctm format:
-# <file-id> <channel> <start-time> <duration> <hyp-word> [<confidence>]
-# note, the confidence defaults to 1 if not provided (these
-# scripts don't actually use the confidence field).
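As a quick aid to reading the deleted script below, here is a minimal standalone sketch (not part of the original file) of how one line in the input ctm format just described can be parsed; the default confidence of 1.0 follows the note above:

    def parse_ctm_line(line):
        # <file-id> <channel> <start-time> <duration> <hyp-word> [<confidence>]
        fields = line.split()
        if len(fields) == 5:
            fields.append('1.0')  # confidence defaults to 1 when absent
        file_id, channel, start, dur, word, conf = fields
        assert channel == '1'     # per-utterance ctms always use channel '1'
        return file_id, float(start), float(dur), word, float(conf)

    print(parse_ctm_line('TimBrown_2008P-0007226-0007620 1 0.850 0.450 go'))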
-
-## TimBrown_2008P-0007226-0007620 1 0.000 0.100 when
-## TimBrown_2008P-0007226-0007620 1 0.100 0.090 i
-## TimBrown_2008P-0007226-0007620 1 0.190 0.300 some
-## TimBrown_2008P-0007226-0007620 1 0.490 0.110 when
-## TimBrown_2008P-0007226-0007620 1 0.600 0.060 i
-## TimBrown_2008P-0007226-0007620 1 0.660 0.190 say
-## TimBrown_2008P-0007226-0007620 1 0.850 0.450 go
-## TimBrown_2008P-0007226-0007620 1 1.300 0.310 [COUGH]
-## TimBrown_2008P-0007226-0007620 1 1.610 0.130 you
-## TimBrown_2008P-0007226-0007620 1 1.740 0.180 got
-## TimBrown_2008P-0007226-0007620 1 1.920 0.370 thirty
-## TimBrown_2008P-0007226-0007620 1 2.290 0.830 seconds
-## TimBrown_2008P-0007226-0007620 1 3.120 0.330 <eps>
-## TimBrown_2008P-0007226-0007620 1 3.450 0.040 [BREATH]
-## TimBrown_2008P-0007226-0007620 1 3.490 0.110 to
-## TimBrown_2008P-0007226-0007620 1 3.600 0.320 [NOISE]
-
-# Input Levenshtein edits : (the output of 'align-text' post-processed by 'wer_per_utt_details.pl')
-
-# AJJacobs_2007P-0001605-0003029 i i ; thought thought ; i'd i'd ; tell tell ; you you ; a a ; little little ; about about ; [UH] [UH] ; what what ; i i ; like like ; to to ; write write ; and and ; [UH] [UH] ; i i ; like like ; to to ; [UH] [UH] ; immerse immerse ; myself myself ; [SMACK] [SMACK] ; in in ; my my ; topics topics ; [UM] [UM] ; i i ; just just ; like like ; to to ; [UH] [UH] ; dive dive ; [SMACK] [SMACK] ; right right ; in in ; and and ; become become ; [UH] [UH] ; sort sort ; of of ; a a ; human human ; guinea guinea ; pig pig ; [BREATH] [BREATH] ; and and ; [UH] [UH]
-# AJJacobs_2007P-0003133-0004110 i i ; see see ; my my ; life life ; as as ; a a ; series series ; of of ; experiments experiments ; [BREATH] [BREATH] ; so so ; [UH] [UH] ; i i ; [NOISE] [NOISE] ; work work ; for for ; esquire esquire ; magazine magazine ; and <eps> ; a a ; couple couple ; of of ; years years ; ago ago ; [BREATH] [BREATH] ; i i ; wrote wrote ; an an ; article article ; called called ; [NOISE] [NOISE] ; my my ; outsourced outsourced ; life life
-
-
-# Output format:
-# <file-id> <channel> <start-time> <duration> <hyp-word> <confidence> <ref-word> <edit-type>
-
-# AJJacobs_2007P-0001605-0003029 1 0 0.09 <eps> 1.0 <eps> sil
-# AJJacobs_2007P-0001605-0003029 1 0.09 0.15 i 1.0 i cor
-# AJJacobs_2007P-0001605-0003029 1 0.24 0.25 thought 1.0 thought cor
-# AJJacobs_2007P-0001605-0003029 1 0.49 0.14 i'd 1.0 i'd cor
-# AJJacobs_2007P-0001605-0003029 1 0.63 0.22 tell 1.0 tell cor
-# AJJacobs_2007P-0001605-0003029 1 0.85 0.11 you 1.0 you cor
-# AJJacobs_2007P-0001605-0003029 1 0.96 0.05 a 1.0 a cor
-# AJJacobs_2007P-0001605-0003029 1 1.01 0.24 little 1.0 little cor
-# AJJacobs_2007P-0001605-0003029 1 1.25 0.5 about 1.0 about cor
-# AJJacobs_2007P-0001605-0003029 1 1.75 0.48 [UH] 1.0 [UH] cor
-# AJJacobs_2007P-0001605-0003029 1 2.23 0.34 <eps> 1.0 <eps> sil
-# AJJacobs_2007P-0001605-0003029 1 2.57 0.21 what 1.0 what cor
-# AJJacobs_2007P-0001605-0003029 1 2.78 0.1 i 1.0 i cor
-# AJJacobs_2007P-0001605-0003029 1 2.88 0.22 like 1.0 like cor
-# AJJacobs_2007P-0001605-0003029 1 3.1 0.13 to 1.0 to cor
-# AJJacobs_2007P-0001605-0003029 1 3.23 0.37 write 1.0 write cor
-# AJJacobs_2007P-0001605-0003029 1 3.6 0.03 <eps> 1.0 <eps> sil
-# AJJacobs_2007P-0001605-0003029 1 3.63 0.36 and 1.0 and cor
-
-
-parser = argparse.ArgumentParser(
-    description = "Append to the CTM the Levenshtein alignment of 'hypothesis' and 'reference'; "
-    "creates augmented CTM with extra fields (see script for details)")
-
-parser.add_argument("--oov", type = int, default = -1,
-                    help = "The integer representation of the OOV symbol; substitutions "
-                    "by the OOV symbol for out-of-vocabulary reference words are treated "
treated " - "as correct, if you also supply the --symbol-table option.") -parser.add_argument("--symbol-table", type = str, - help = "The words.txt your system used; if supplied, it is used to " - "determine OOV words (and such words will count as correct if " - "substituted by the OOV symbol). See also the --oov option") -# Required arguments -parser.add_argument("edits_in", metavar = "", - help = "Filename of output of 'align-text', which this program reads. " - "Use /dev/stdin for standard input.") -parser.add_argument("ctm_in", metavar = "", - help = "Filename of input hypothesis in ctm format") -parser.add_argument("ctm_edits_out", metavar = "", - help = "Filename of output (CTM appended with word-edit information)") -args = parser.parse_args() - - - -def OpenFiles(): - global ctm_edits_out, edits_in, ctm_in, symbol_table, oov_word - try: - ctm_edits_out = open(args.ctm_edits_out, 'w') - except: - sys.exit("get_ctm_edits.py: error opening ctm-edits file {0} for output".format( - args.ctm_edits_out)) - try: - edits_in = open(args.edits_in) - except: - sys.exit("get_ctm_edits.py: error opening edits file {0} for input".format( - args.edits_in)) - try: - ctm_in = open(args.ctm_in) - except: - sys.exit("get_ctm_edits.py: error opening ctm file {0} for input".format( - args.ctm_in)) - - symbol_table = set() - oov_word = None - if args.symbol_table != None: - if args.oov == -1: - print("get_ctm_edits.py: error: if you set the the --symbol-table option " - "you must also set the --oov option", file = sys.stderr) - try: - f = open(args.symbol_table, 'r') - for line in f.readlines(): - [ word, integer ] = line.split() - if int(integer) == args.oov: - oov_word = word - symbol_table.add(word) - except: - sys.exit("get_ctm_edits.py: error opening symbol-table file {0} for " - "input (or bad file), exception is: {1}".format(args.symbol_table)) - f.close() - if oov_word == None: - sys.exit("get_ctm_edits.py: OOV word not found: check the values of " - "--symbol-table={0} and --oov={1}".format(args.symbol_table, - args.oov)) - -# This function takes two lists -# edits_array = [ [ hyp_word1, ref_word1], [ hyp_word2, ref_word2 ], ... ] -# ctm_array = [ [ start1, duration1, hyp_word1, confidence1 ], ... ] -# -# and pads them with new list elements so that the entries 'match up'. What we -# are aiming for is that for each i, ctm_array[i][2] == edits_array[i][0]. The -# reasons why this is not automatically true are: -# -# (1) There may be deletions in the hypothesis sequence, which would lead to -# pairs like [ '', ref_word ]. -# (2) The ctm may have been written 'with silence', which will lead to -# ctm entries like [ 1, 7.8, 0.9, '' ] where the '' refers -# to the optional-silence from the lexicon. -# -# We introduce suitable entries in to edits_array and ctm_array as necessary -# to make them 'match up'. This function returns the pair (new_edits_array, -# new_ctm_array). -def PadArrays(edits_array, ctm_array): - new_edits_array = [] - new_ctm_array = [] - edits_len = len(edits_array) - ctm_len = len(ctm_array) - edits_pos = 0 - ctm_pos = 0 - # current_time is the end of the last ctm segment we processesed. 
-    current_time = ctm_array[0][0] if ctm_len > 0 else 0.0
-    while edits_pos < edits_len or ctm_pos < ctm_len:
-        if edits_pos < edits_len and ctm_pos < ctm_len and \
-           edits_array[edits_pos][0] == ctm_array[ctm_pos][2] and \
-           edits_array[edits_pos][0] != '<eps>':
-            # This is the normal case, where there are 2 entries whose
-            # hyp-words match up
-            new_edits_array.append(edits_array[edits_pos])
-            edits_pos += 1
-            new_ctm_array.append(ctm_array[ctm_pos])
-            current_time = ctm_array[ctm_pos][0] + ctm_array[ctm_pos][1]
-            ctm_pos += 1
-        elif edits_pos < edits_len and edits_array[edits_pos][0] == '<eps>':
-            # There was a deletion.  Pad with an empty ctm segment with '<eps>' as
-            # the word.
-            new_edits_array.append(edits_array[edits_pos])
-            edits_pos += 1
-            duration = 0.0
-            confidence = 1.0
-            new_ctm_array.append([ current_time, duration, '<eps>', confidence])
-        elif ctm_pos < ctm_len and ctm_array[ctm_pos][2] == '<eps>':
-            # There was silence in the ctm, and either we've reached the end of the
-            # edits sequence, or the hyp word was not '<eps>':
-            new_edits_array.append(['<eps>', '<eps>'])
-            new_ctm_array.append(ctm_array[ctm_pos])
-            current_time = ctm_array[ctm_pos][0] + ctm_array[ctm_pos][1]
-            ctm_pos += 1
-        else:
-            raise Exception("Could not align edits_array = {0} and ctm_array = {1}; "
-                            "edits-position = {2}, ctm-position = {3}, "
-                            "pending-edit={4}, pending-ctm-entry={5}".format(
-                    edits_array, ctm_array, edits_pos, ctm_pos,
-                    edits_array[edits_pos] if edits_pos < edits_len else None,
-                    ctm_array[ctm_pos] if ctm_pos < ctm_len else None))
-    assert len(new_edits_array) == len(new_ctm_array)
-    return (new_edits_array, new_ctm_array)
-
-
-# This function returns the appropriate edit-type to output in the ctm-edits
-# file.  The ref_word and hyp_word and duration are the values we'll print in
-# the ctm-edits file.
-def GetEditType(hyp_word, ref_word, duration):
-    global oov_word
-    if hyp_word == ref_word and hyp_word != '<eps>':
-        return 'cor'
-    elif hyp_word != '<eps>' and ref_word == '<eps>':
-        return 'ins'
-    elif hyp_word == '<eps>' and ref_word != '<eps>' and duration == 0.0:
-        return 'del'
-    elif hyp_word == oov_word and \
-         len(symbol_table) != 0 and not ref_word in symbol_table:
-        return 'cor'  # this special case is treated as correct.
-    elif hyp_word == '<eps>' == ref_word and duration > 0.0:
-        # silence in hypothesis; we don't match this up with any reference word.
-        return 'sil'
-    else:
-        # The following assertion is because, based on how PadArrays
-        # works, we shouldn't hit this case.
-        assert hyp_word != '<eps>' and ref_word != '<eps>'
-        return 'sub'
-
-# this prints a number with a certain number of digits after
-# the point, while removing trailing zeros.
-def FloatToString(f):
-    num_digits = 6  # we want to print 6 digits after the point
-    g = f
-    while abs(g) > 1.0:
-        g *= 0.1
-        num_digits += 1
-    format_str = '%.{0}g'.format(num_digits)
-    return format_str % f
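To make the decision table in GetEditType above concrete, here is a self-contained sketch of the same logic with a couple of worked cases; '<unk>' as the OOV word and the tiny vocabulary are illustrative assumptions, not values from the script:

    def edit_type(hyp, ref, dur, oov='<unk>', vocab=frozenset()):
        if hyp == ref != '<eps>':
            return 'cor'
        if hyp != '<eps>' and ref == '<eps>':
            return 'ins'
        if hyp == '<eps>' and ref != '<eps>' and dur == 0.0:
            return 'del'
        if hyp == oov and vocab and ref not in vocab:
            return 'cor'  # OOV ref word substituted by the OOV symbol counts as correct
        if hyp == '<eps>' == ref and dur > 0.0:
            return 'sil'  # optional silence; consumes no reference word
        return 'sub'

    assert edit_type('hello', 'hello', 0.30) == 'cor'
    assert edit_type('<eps>', 'hello', 0.00) == 'del'
    assert edit_type('<unk>', 'zyzzyva', 0.30, vocab={'hello'}) == 'cor'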
-
-def OutputCtm(utterance_id, edits_array, ctm_array):
-    global ctm_edits_out
-    # note: this function expects the padded entries created by PadArrays.
-    assert len(edits_array) == len(ctm_array)
-    channel = '1'  # this is hardcoded at both input and output, since this CTM
-                   # doesn't really represent recordings, only utterances.
-    for i in range(len(edits_array)):
-        ( hyp_word, ref_word ) = edits_array[i]
-        ( start_time, duration, hyp_word2, confidence ) = ctm_array[i]
-        if not hyp_word == hyp_word2:
-            print("Error producing output CTM for edit = {0} and ctm = {1}".format(
-                    edits_array[i], ctm_array[i]), file = sys.stderr)
-            sys.exit(1)
-        assert hyp_word == hyp_word2
-        edit_type = GetEditType(hyp_word, ref_word, duration)
-        print(utterance_id, channel, FloatToString(start_time),
-              FloatToString(duration), hyp_word, confidence, ref_word,
-              edit_type, file = ctm_edits_out)
-
-
-def ProcessOneUtterance(utterance_id, edits_line, ctm_lines):
-    try:
-        # Remove the utterance-id from the beginning of the edits line
-        edits_fields = edits_line[len(utterance_id) + 1:]
-
-        # e.g. if edits_fields is now 'i i ; see be ; my my ', edits_array will become
-        # [ ['i', 'i'], ['see', 'be'], ['my', 'my'] ]
-        edits_array = [ x.split() for x in edits_fields.split(";") ]
-        # The lines below are a fix for when we get empty transcripts and reference, hence
-        # just whitespace in 'edits_fields'.
-        if edits_array == [[]]:
-            edits_array = []
-        try:
-            for x in edits_array:
-                assert len(x) == 2
-        except:
-            sys.exit("get_ctm_edits.py: could not make sense of edits line: " + edits_line)
-
-        # ctm_array will now become something like [ ['1', '1.010', '0.240', 'little' ], ... ]
-        ctm_array = [ x.split() for x in ctm_lines ]
-        ctm_array = []
-        for line in ctm_lines:
-            try:
-                # Strip off the utterance-id and split the remaining fields,
-                # which should be: channel==1, start, dur, word, [confidence]
-                a = line[len(utterance_id) + 1:].split()
-                if len(a) == 4:
-                    a.append(1.0)  # confidence defaults to 1.0.
-                [ channel, start, dur, word, confidence ] = a
-                if channel != '1':
-                    raise Exception("Channel should be 1, got: " + channel)
-                ctm_array.append([ float(start), float(dur), word, float(confidence) ])
-            except Exception as e:
-                sys.exit("get_ctm_edits.py: error processing ctm line {0} "
-                         "... exception is: {1} {2}".format(line, type(e), str(e)))
-        # ctm_array will now be something like [ [ 1.010, 0.240, 'little', 1.0 ], ... ]
-
-        # The following call pads the edits and ctm arrays with appropriate
-        # entries so that they have the same length and the elements 'match up'.
-        (edits_array, ctm_array) = PadArrays(edits_array, ctm_array)
-    except Exception as e:
-        sys.exit("get_ctm_edits.py: error processing utterance {0}, error was: {1}".format(
-                utterance_id, str(e)))
-    OutputCtm(utterance_id, edits_array, ctm_array)
-
-def ProcessData():
-    num_utterances_processed = 0
-
-    pending_ctm_line = ctm_in.readline()
-
-    while True:
-        this_edits_line = edits_in.readline()
-        if this_edits_line == '':
-            if pending_ctm_line != '':
-                sys.exit("get_ctm_edits.py: edits_in input {0} ended before "
-                         "ctm input was ended.  We processed {1} "
-                         "utterances.".format(args.edits_in, num_utterances_processed))
We processed {1} " - "utterances.".format(args.edits_in, num_utterances_processed)) - break - a = this_edits_line.split() - if len(a) == 0: - sys.exit("get_ctm_edits.py: edits_input {0} had an empty line".format( - args.edits_in)) - utterance_id = a[0] - utterance_id_len = len(utterance_id) - this_utterance_ctm_lines = [] - while pending_ctm_line[0:utterance_id_len] == utterance_id: - this_utterance_ctm_lines.append(pending_ctm_line) - pending_ctm_line = ctm_in.readline() - ProcessOneUtterance(utterance_id, this_edits_line, - this_utterance_ctm_lines) - num_utterances_processed += 1 - print("get_ctm_edits.py: processed {0} utterances".format( - num_utterances_processed), file=sys.stderr) - - -OpenFiles() -ProcessData() - diff --git a/egs/wsj/s5/steps/cleanup/get_non_scored_words.py b/egs/wsj/s5/steps/cleanup/get_non_scored_words.py deleted file mode 100755 index 2ffdf3b7f94..00000000000 --- a/egs/wsj/s5/steps/cleanup/get_non_scored_words.py +++ /dev/null @@ -1,90 +0,0 @@ -#!/usr/bin/env python - -# Copyright 2016 Vimal Manohar -# 2016 Johns Hopkins University (author: Daniel Povey) -# Apache 2.0 - -from __future__ import print_function -import sys, operator, argparse, os -from collections import defaultdict - -# If you supply the directory (the one that corresponds to -# how you decoded the data) to this script, it assumes that the -# directory contains phones/align_lexicon.int, and it uses this to work -# out a reasonable guess of the non-scored phones, based on which have -# a single-word pronunciation that maps to a silence phone. -# It then uses the words.txt to work out the written form of those words. - -parser = argparse.ArgumentParser( - description = "This program works out a reasonable guess at a list of " - "non-scored words (words that won't affect the WER evaluation): " - "things like [COUGH], [NOISE] and so on. This is useful because a list of " - "such words is required by some other scripts (e.g. modify_ctm_edits.py), " - "and it's inconvenient to have to specify the list manually for each language. " - "This program writes out the words in text form, one per line.") - -parser.add_argument("lang", type = str, - help = "The lang/ directory. This program expects " - "lang/words.txt and lang/phones/silence.int and " - "lang/phones/align_lexicon.int to exist, and will use them to work " - "out a reasonable guess of the non-scored words (as those whose " - "pronunciations are a single phone in the 'silphones' list)") - -args = parser.parse_args() - -non_scored_words = set() - - -def ReadLang(lang_dir): - global non_scored_words - - if not os.path.isdir(lang_dir): - sys.exit("modify_ctm_edits.py expected lang/ directory {0} to " - "exist.".format(lang_dir)) - for f in [ '/words.txt', '/phones/silence.int', '/phones/align_lexicon.int' ]: - if not os.path.exists(lang_dir + f): - sys.exit("modify_ctm_edits.py: expected file {0}{1} to exist.".format( - lang_dir, f)) - # read silence-phones. - try: - silence_phones = set() - for line in open(lang_dir + '/phones/silence.int').readlines(): - silence_phones.add(int(line)) - except Exception as e: - sys.exit("modify_ctm_edits.py: problem reading file " - "{0}/phones/silence.int: {1}".format(lang_dir, str(e))) - - # read align_lexicon.int. - # format is: .. 
-    # We're looking for lines of the form:
-    # w w p
-    # where w > 0 and p is in the set 'silence_phones'
-    try:
-        silence_word_ints = set()
-        for line in open(lang_dir + '/phones/align_lexicon.int').readlines():
-            a = line.split()
-            if len(a) == 3 and a[0] == a[1] and int(a[0]) > 0 and \
-               int(a[2]) in silence_phones:
-                silence_word_ints.add(int(a[0]))
-    except Exception as e:
-        sys.exit("modify_ctm_edits.py: problem reading file "
-                 "{0}/phones/align_lexicon.int: "
-                 "{1}".format(lang_dir, str(e)))
-
-    try:
-        for line in open(lang_dir + '/words.txt').readlines():
-            [ word, integer ] = line.split()
-            if int(integer) in silence_word_ints:
-                non_scored_words.add(word)
-    except Exception as e:
-        sys.exit("modify_ctm_edits.py: problem reading file "
-                 "{0}/words.txt.int: {1}".format(lang_dir, str(e)))
-
-    if not len(non_scored_words) == len(silence_word_ints):
-        sys.exit("modify_ctm_edits.py: error getting silence words, len({0}) != len({1})",
-                 str(non_scored_words), str(silence_word_ints))
-    for word in non_scored_words:
-        print(word)
-
-
-ReadLang(args.lang)
diff --git a/egs/wsj/s5/steps/cleanup/internal/get_ctm_edits.py b/egs/wsj/s5/steps/cleanup/internal/get_ctm_edits.py
index 385b0c5c5dd..d0f762d1197 100755
--- a/egs/wsj/s5/steps/cleanup/internal/get_ctm_edits.py
+++ b/egs/wsj/s5/steps/cleanup/internal/get_ctm_edits.py
@@ -275,11 +275,20 @@ def OutputCtm(utterance_id, edits_array, ctm_array):
 def ProcessOneUtterance(utterance_id, edits_line, ctm_lines):
     try:
         # Remove the utterance-id from the beginning of the edits line
-        edits_line = edits_line[len(utterance_id) + 1:]
+        edits_fields = edits_line[len(utterance_id) + 1:]
 
-        # e.g. if edits_line is now 'i i ; see be ; my my ', edits_array will become
+        # e.g. if edits_fields is now 'i i ; see be ; my my ', edits_array will become
         # [ ['i', 'i'], ['see', 'be'], ['my', 'my'] ]
-        edits_array = [ x.split() for x in edits_line.split(";") ]
+        fields_split = edits_fields.split()
+        first_fields, second_fields = fields_split[0::3], fields_split[1::3]
+        if (
+            len(first_fields) != len(second_fields) or
+            (len(fields_split) >= 3 and set(fields_split[2::3]) != {';'})
+        ):
+            sys.exit("get_ctm_edits.py: could not make sense of edits line: " + edits_line)
+
+        edits_array = list(zip(first_fields, second_fields))
+
         # ctm_array will now become something like [ ['1', '1.010', '0.240', 'little' ], ... ]
         ctm_array = [ x.split() for x in ctm_lines ]
         ctm_array = []
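The stride-3 slicing in the new code above treats the alignment line as a flat sequence of 'hyp ref ;' triples; a small standalone illustration with made-up data:

    fields = 'i i ; see be ; my my'.split()
    hyps, refs, seps = fields[0::3], fields[1::3], fields[2::3]
    assert len(hyps) == len(refs) and set(seps) == {';'}
    print(list(zip(hyps, refs)))  # [('i', 'i'), ('see', 'be'), ('my', 'my')]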
diff --git a/egs/wsj/s5/steps/cleanup/internal/get_non_scored_words.py b/egs/wsj/s5/steps/cleanup/internal/get_non_scored_words.py
index 2ffdf3b7f94..84d1ca0fbf6 100755
--- a/egs/wsj/s5/steps/cleanup/internal/get_non_scored_words.py
+++ b/egs/wsj/s5/steps/cleanup/internal/get_non_scored_words.py
@@ -39,11 +39,11 @@ def ReadLang(lang_dir):
     global non_scored_words
 
     if not os.path.isdir(lang_dir):
-        sys.exit("modify_ctm_edits.py expected lang/ directory {0} to "
+        sys.exit("get_non_scored_words.py expected lang/ directory {0} to "
                  "exist.".format(lang_dir))
     for f in [ '/words.txt', '/phones/silence.int', '/phones/align_lexicon.int' ]:
         if not os.path.exists(lang_dir + f):
-            sys.exit("modify_ctm_edits.py: expected file {0}{1} to exist.".format(
+            sys.exit("get_non_scored_words.py: expected file {0}{1} to exist.".format(
                 lang_dir, f))
     # read silence-phones.
     try:
@@ -51,7 +51,7 @@ def ReadLang(lang_dir):
         for line in open(lang_dir + '/phones/silence.int').readlines():
             silence_phones.add(int(line))
     except Exception as e:
-        sys.exit("modify_ctm_edits.py: problem reading file "
+        sys.exit("get_non_scored_words.py: problem reading file "
                  "{0}/phones/silence.int: {1}".format(lang_dir, str(e)))
 
     # read align_lexicon.int.
@@ -67,7 +67,7 @@ def ReadLang(lang_dir):
             int(a[2]) in silence_phones:
             silence_word_ints.add(int(a[0]))
     except Exception as e:
-        sys.exit("modify_ctm_edits.py: problem reading file "
+        sys.exit("get_non_scored_words.py: problem reading file "
                  "{0}/phones/align_lexicon.int: "
                  "{1}".format(lang_dir, str(e)))
 
@@ -77,11 +77,11 @@ def ReadLang(lang_dir):
         if int(integer) in silence_word_ints:
             non_scored_words.add(word)
     except Exception as e:
-        sys.exit("modify_ctm_edits.py: problem reading file "
+        sys.exit("get_non_scored_words.py: problem reading file "
                  "{0}/words.txt.int: {1}".format(lang_dir, str(e)))
 
     if not len(non_scored_words) == len(silence_word_ints):
-        sys.exit("modify_ctm_edits.py: error getting silence words, len({0}) != len({1})",
+        sys.exit("get_non_scored_words.py: error getting silence words, len({0}) != len({1})",
                  str(non_scored_words), str(silence_word_ints))
     for word in non_scored_words:
         print(word)
diff --git a/egs/wsj/s5/steps/cleanup/internal/modify_ctm_edits.py b/egs/wsj/s5/steps/cleanup/internal/modify_ctm_edits.py
index 1022196a456..ea56219fe2a 100755
--- a/egs/wsj/s5/steps/cleanup/internal/modify_ctm_edits.py
+++ b/egs/wsj/s5/steps/cleanup/internal/modify_ctm_edits.py
@@ -52,7 +52,7 @@
 parser = argparse.ArgumentParser(
     description = "This program modifies the reference in the ctm-edits which "
-    "is output by steps/cleanup/get_ctm_edits.py, to allow insertions, deletions and "
+    "is output by steps/cleanup/internal/get_ctm_edits.py, to allow insertions, deletions and "
     "substitutions of non-scored words, and [if --allow-repetitions=true], "
     "duplications of single words or pairs of scored words (to account for dysfluencies "
     "that were not transcribed). Note: deletions and substitutions of non-scored words "
diff --git a/egs/wsj/s5/steps/cleanup/internal/segment_ctm_edits.py b/egs/wsj/s5/steps/cleanup/internal/segment_ctm_edits.py
index 7901aca0320..57e9d6ab959 100755
--- a/egs/wsj/s5/steps/cleanup/internal/segment_ctm_edits.py
+++ b/egs/wsj/s5/steps/cleanup/internal/segment_ctm_edits.py
@@ -912,23 +912,23 @@ def ProcessData():
     try:
         f_in = open(args.ctm_edits_in)
     except:
-        sys.exit("modify_ctm_edits.py: error opening ctm-edits input "
+        sys.exit("segment_ctm_edits.py: error opening ctm-edits input "
                  "file {0}".format(args.ctm_edits_in))
     try:
         text_output_handle = open(args.text_out, 'w')
     except:
-        sys.exit("modify_ctm_edits.py: error opening text output "
+        sys.exit("segment_ctm_edits.py: error opening text output "
                  "file {0}".format(args.text_out))
     try:
        segments_output_handle = open(args.segments_out, 'w')
     except:
-        sys.exit("modify_ctm_edits.py: error opening segments output "
+        sys.exit("segment_ctm_edits.py: error opening segments output "
                  "file {0}".format(args.text_out))
     if args.ctm_edits_out != None:
         try:
             ctm_edits_output_handle = open(args.ctm_edits_out, 'w')
         except:
-            sys.exit("modify_ctm_edits.py: error opening ctm-edits output "
+            sys.exit("segment_ctm_edits.py: error opening ctm-edits output "
                      "file {0}".format(args.ctm_edits_out))
 
     # Most of what we're doing in the lines below is splitting the input lines
@@ -936,10 +936,10 @@
     # and then printing the modified lines.
     first_line = f_in.readline()
     if first_line == '':
-        sys.exit("modify_ctm_edits.py: empty input")
+        sys.exit("segment_ctm_edits.py: empty input")
     split_pending_line = first_line.split()
     if len(split_pending_line) == 0:
-        sys.exit("modify_ctm_edits.py: bad input line " + first_line)
+        sys.exit("segment_ctm_edits.py: bad input line " + first_line)
     cur_utterance = split_pending_line[0]
     split_lines_of_cur_utterance = []
 
@@ -966,14 +966,14 @@ def ProcessData():
             split_pending_line = next_line.split()
             if len(split_pending_line) == 0:
                 if next_line != '':
-                    sys.exit("modify_ctm_edits.py: got an empty or whitespace input line")
+                    sys.exit("segment_ctm_edits.py: got an empty or whitespace input line")
     try:
         text_output_handle.close()
         segments_output_handle.close()
         if args.ctm_edits_out != None:
             ctm_edits_output_handle.close()
     except:
-        sys.exit("modify_ctm_edits.py: error closing one or more outputs "
+        sys.exit("segment_ctm_edits.py: error closing one or more outputs "
                  "(broken pipe or full disk?)")
 
@@ -982,12 +982,12 @@ def ReadNonScoredWords(non_scored_words_file):
     try:
         f = open(non_scored_words_file)
     except:
-        sys.exit("modify_ctm_edits.py: error opening file: "
+        sys.exit("segment_ctm_edits.py: error opening file: "
                  "--non-scored-words=" + non_scored_words_file)
     for line in f.readlines():
         a = line.split()
         if not len(line.split()) == 1:
-            sys.exit("modify_ctm_edits.py: bad line in non-scored-words "
+            sys.exit("segment_ctm_edits.py: bad line in non-scored-words "
                      "file {0}: {1}".format(non_scored_words_file, line))
         non_scored_words.add(a[0])
     f.close()
diff --git a/egs/wsj/s5/steps/cleanup/internal/taint_ctm_edits.py b/egs/wsj/s5/steps/cleanup/internal/taint_ctm_edits.py
index c763d7191a1..2230a10aee2 100755
--- a/egs/wsj/s5/steps/cleanup/internal/taint_ctm_edits.py
+++ b/egs/wsj/s5/steps/cleanup/internal/taint_ctm_edits.py
@@ -132,12 +132,12 @@ def ProcessData():
     try:
         f_in = open(args.ctm_edits_in)
     except:
-        sys.exit("modify_ctm_edits.py: error opening ctm-edits input "
+        sys.exit("taint_ctm_edits.py: error opening ctm-edits input "
                  "file {0}".format(args.ctm_edits_in))
     try:
         f_out = open(args.ctm_edits_out, 'w')
     except:
-        sys.exit("modify_ctm_edits.py: error opening ctm-edits output "
+        sys.exit("taint_ctm_edits.py: error opening ctm-edits output "
                  "file {0}".format(args.ctm_edits_out))
 
     num_lines_processed = 0
@@ -147,10 +147,10 @@
     # and then printing the modified lines.
     first_line = f_in.readline()
     if first_line == '':
-        sys.exit("modify_ctm_edits.py: empty input")
+        sys.exit("taint_ctm_edits.py: empty input")
     split_pending_line = first_line.split()
     if len(split_pending_line) == 0:
-        sys.exit("modify_ctm_edits.py: bad input line " + first_line)
+        sys.exit("taint_ctm_edits.py: bad input line " + first_line)
     cur_utterance = split_pending_line[0]
     split_lines_of_cur_utterance = []
 
@@ -170,7 +170,7 @@ def ProcessData():
             split_pending_line = next_line.split()
             if len(split_pending_line) == 0:
                 if next_line != '':
-                    sys.exit("modify_ctm_edits.py: got an empty or whitespace input line")
+                    sys.exit("taint_ctm_edits.py: got an empty or whitespace input line")
     try:
         f_out.close()
     except:
@@ -181,13 +181,13 @@ def PrintNonScoredStats():
     if args.verbose < 1:
         return
     if num_lines == 0:
-        print("modify_ctm_edits.py: processed no input.", file = sys.stderr)
+        print("taint_ctm_edits.py: processed no input.", file = sys.stderr)
     num_lines_modified = sum(ref_change_stats.values())
     num_incorrect_lines = num_lines - num_correct_lines
     percent_lines_incorrect= '%.2f' % (num_incorrect_lines * 100.0 / num_lines)
     percent_modified = '%.2f' % (num_lines_modified * 100.0 / num_lines);
     percent_of_incorrect_modified = '%.2f' % (num_lines_modified * 100.0 / num_incorrect_lines)
-    print("modify_ctm_edits.py: processed {0} lines of ctm ({1}% of which incorrect), "
+    print("taint_ctm_edits.py: processed {0} lines of ctm ({1}% of which incorrect), "
          "of which {2} were changed fixing the reference for non-scored words "
          "({3}% of lines, or {4}% of incorrect lines)".format(
              num_lines, percent_lines_incorrect, num_lines_modified,
@@ -198,7 +198,7 @@ def PrintNonScoredStats():
                   key = lambda x: ref_change_stats[x])
     num_keys_to_print = 40 if args.verbose >= 2 else 10
 
-    print("modify_ctm_edits.py: most common edits (as percentages "
+    print("taint_ctm_edits.py: most common edits (as percentages "
          "of all such edits) are:\n" +
          ('\n'.join([ '%s [%.2f%%]' % (k, ref_change_stats[k]*100.0/num_lines_modified)
                       for k in keys[0:num_keys_to_print]]))
diff --git a/egs/wsj/s5/steps/cleanup/lattice_oracle_align.sh b/egs/wsj/s5/steps/cleanup/lattice_oracle_align.sh
index 80b4739a629..dc8cd7d3deb 100755
--- a/egs/wsj/s5/steps/cleanup/lattice_oracle_align.sh
+++ b/egs/wsj/s5/steps/cleanup/lattice_oracle_align.sh
@@ -192,7 +192,7 @@ if [ $stage -le 5 ]; then
   $cmd $dir/log/get_ctm_edits.log \
     align-text ark:$dir/oracle_hyp.txt ark:$dir/text ark,t:- \| \
-    steps/cleanup/get_ctm_edits.py --oov=$oov --symbol-table=$lang/words.txt \
+    steps/cleanup/internal/get_ctm_edits.py --oov=$oov --symbol-table=$lang/words.txt \
       /dev/stdin $dir/ctm $dir/ctm_edits || exit 1
 
   echo "$0: ctm with edits information appended is in $dir/ctm_edits"
diff --git a/egs/wsj/s5/steps/cleanup/make_one_biased_lm.py b/egs/wsj/s5/steps/cleanup/make_one_biased_lm.py
deleted file mode 100755
index 2e0bc1fcda1..00000000000
--- a/egs/wsj/s5/steps/cleanup/make_one_biased_lm.py
+++ /dev/null
@@ -1,310 +0,0 @@
-#!/usr/bin/env python
-
-from __future__ import print_function
-import sys
-import argparse
-import math
-from collections import defaultdict
-
-parser = argparse.ArgumentParser(description="""
-This script creates a biased language model suitable for alignment and
-data-cleanup purposes.  It reads (possibly multiple) lines of integerized text
-from the input and writes a text-form FST of a backoff language model to
-the standard output, to be piped into fstcompile.""")
-
-parser.add_argument("--word-disambig-symbol", type = int, required = True,
-                    help = "Integer corresponding to the disambiguation "
-                    "symbol (normally #0) for backoff arcs")
-parser.add_argument("--ngram-order", type = int, default = 4,
-                    choices = [2,3,4,5,6,7],
-                    help = "Maximum order of n-gram to use (but see also "
-                    "--min-lm-state-count; the effective order may be less).")
-parser.add_argument("--min-lm-state-count", type = int, default = 10,
-                    help = "Minimum count below which we will completely "
-                    "discount an LM-state (if it is of order > 2, i.e. "
-                    "history-length > 1).")
-parser.add_argument("--top-words", type = str,
-                    help = "File containing frequent words and probabilities to be added into "
-                    "the language model, with lines in the format '<word-index> <probability>'. "
-                    "These probabilities will be added to the probabilities in the unigram "
-                    "backoff state and then renormalized; this option allows you to introduce "
-                    "common words to the LM with specified probabilities.")
-parser.add_argument("--discounting-constant", type = float, default = 0.3,
-                    help = "Discounting constant D for standard (unmodified) Kneser-Ney; "
-                    "must be strictly between 0 and 1.  A value closer to 0 will give "
-                    "you a more-strongly-biased LM.")
-parser.add_argument("--verbose", type = int, default = 0,
-                    choices=[0,1,2,3,4,5], help = "Verbose level")
-
-args = parser.parse_args()
-
-if args.verbose >= 1:
-    print(' '.join(sys.argv), file = sys.stderr)
-
-
-class NgramCounts:
-    ## A note on the data structure.
-    ## Firstly, all words are represented as integers.
-    ## We store n-gram counts as an array, indexed by (history-length == n-gram order minus one)
-    ## (note: python calls arrays "lists") of dicts from histories to counts, where
-    ## histories are tuples of integers and "counts" are dicts from integer to float.
-    ## For instance, when accumulating the 4-gram count for the '8' in the sequence '5 6 7 8',
-    ## we'd do as follows:
-    ##   self.counts[3][(5,6,7)][8] += 1.0
-    ## where the [3] indexes an array, the [(5,6,7)] indexes a dict, and
-    ## the [8] indexes a dict.
-    def __init__(self, ngram_order):
-        self.ngram_order = ngram_order
-        # Integerized counts will never contain negative numbers, so
-        # inside this program, we use -3 and -2 for the BOS and EOS symbols
-        # respectively.
-        # Note: it's actually important that the bos-symbol is the most negative;
-        # it helps ensure that we print the state with <s> as the left context first
-        # when we print the FST, and this means that the start-state will have
-        # the correct value.
-        self.bos_symbol = -3
-        self.eos_symbol = -2
-        # backoff_symbol is kind of a pseudo-word; it's used in keeping track of
-        # the backoff counts in each state.
-        self.backoff_symbol = -1
-        self.counts = []
-        for n in range(ngram_order):
-            # The 'lambda: defaultdict(float)' is an anonymous function taking
-            # no arguments that returns a new defaultdict(float).
-            # If we index self.counts[n][history] for a history-length n < ngram_order
-            # and a previously unseen history, it will create a new defaultdict
-            # that defaults to 0.0 [since the function float() will return 0.0].
-            # This means that we can index self.counts without worrying about
-            # undefined values.
-            self.counts.append(defaultdict(lambda: defaultdict(float)))
-
-    # adds a raw count (called while processing input data).
-    # Suppose we see the sequence '6 7 8 9' and ngram_order=4, 'history'
-    # would be (6,7,8) and 'predicted_word' would be 9; 'count' would be
-    # 1.0.
-    def AddCount(self, history, predicted_word, count):
-        self.counts[len(history)][history][predicted_word] += count
-
-    # 'line' is a string containing a sequence of integer word-ids.
-    # This function adds the un-smoothed counts from this line of text.
-    def AddRawCountsFromLine(self, line):
-        try:
-            words = [self.bos_symbol] + [ int(x) for x in line.split() ] + [self.eos_symbol]
-        except:
-            sys.exit("make_one_biased_lm.py: bad input line {0} (expected a sequence "
-                     "of integers)".format(line))
-
-        for n in range(1, len(words)):
-            predicted_word = words[n]
-            history_start = max(0, n + 1 - self.ngram_order)
-            history = tuple(words[history_start:n])
-            self.AddCount(history, predicted_word, 1.0)
-
-    def AddRawCountsFromStandardInput(self):
-        lines_processed = 0
-        while True:
-            line = sys.stdin.readline()
-            if line == '':
-                break
-            self.AddRawCountsFromLine(line)
-            lines_processed += 1
-        if lines_processed == 0 or args.verbose > 0:
-            print("make_one_biased_lm.py: processed {0} lines of input".format(
-                    lines_processed), file = sys.stderr)
-
-
-    # This function returns a dict from history (as a tuple of integers of
-    # length > 1, ignoring lower-order histories), to the total count of this
-    # history state plus all history-states which back off to this history state.
-    # It's used inside CompletelyDiscountLowCountStates().
-    def GetHistToTotalCount(self):
-        ans = defaultdict(float)
-        for n in range(2, self.ngram_order):
-            for hist, word_to_count in self.counts[n].items():
-                total_count = sum(word_to_count.values())
-                while len(hist) >= 2:
-                    ans[hist] += total_count
-                    hist = hist[1:]
-        return ans
-
-
-    # This function will completely discount the counts in any LM-states of
-    # order > 2 (i.e. history-length > 1) that have total count below
-    # 'min_count'; when computing the total counts, we include higher-order
-    # LM-states that would back off to 'this' lm-state, in the total.
-    def CompletelyDiscountLowCountStates(self, min_count):
-        hist_to_total_count = self.GetHistToTotalCount()
-        for n in reversed(range(2, self.ngram_order)):
-            this_order_counts = self.counts[n]
-            for hist in this_order_counts.keys():
-                if hist_to_total_count[hist] < min_count:
-                    # we need to completely back off this count.
-                    word_to_count = this_order_counts[hist]
-                    del this_order_counts[hist]  # delete the key from the dict.
-                    backoff_hist = hist[1:]  # this will be a tuple not a list.
-                    for word, count in word_to_count.items():
-                        self.AddCount(backoff_hist, word, count)
-
-
-    # This backs off the counts according to Kneser-Ney (unmodified,
-    # with interpolation).
-    def ApplyBackoff(self, D):
-        assert D > 0.0 and D < 1.0
-        for n in reversed(range(1, self.ngram_order)):
-            this_order_counts = self.counts[n]
-            for hist, word_to_count in this_order_counts.items():
-                backoff_hist = hist[1:]
-                backoff_word_to_count = self.counts[n-1][backoff_hist]
-                this_discount_total = 0.0
-                for word in word_to_count:
-                    assert word_to_count[word] >= 1.0
-                    word_to_count[word] -= D
-                    this_discount_total += D
-                    # Interpret the following line as incrementing the
-                    # count-of-counts for the next-lower order.
-                    backoff_word_to_count[word] += 1.0
-                word_to_count[self.backoff_symbol] += this_discount_total
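A tiny standalone demonstration of the nested-defaultdict count structure described in the class comment above (values invented for illustration):

    from collections import defaultdict

    ngram_order = 4
    # one dict per history length 0..3; each maps a history tuple to {word: count}
    counts = [defaultdict(lambda: defaultdict(float)) for _ in range(ngram_order)]
    counts[3][(5, 6, 7)][8] += 1.0      # the 4-gram count for 8 after '5 6 7'
    assert counts[2][(6, 7)][8] == 0.0  # unseen entries default to 0.0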
-
-    # This function prints out to stderr the n-gram counts stored in this
-    # object; it's used for debugging.
-    def Print(self, info_string):
-        print(info_string, file=sys.stderr)
-        # these are useful for debug.
-        total = 0.0
-        total_excluding_backoff = 0.0
-        for this_order_counts in self.counts:
-            for hist, word_to_count in this_order_counts.items():
-                this_total_count = sum(word_to_count.values())
-                print(str(hist) + ': total={0} '.format(this_total_count),
-                      end='', file=sys.stderr)
-                print(' '.join(['{0} -> {1} '.format(word, count)
-                                for word, count in word_to_count.items() ]),
-                      file = sys.stderr)
-                total += this_total_count
-                total_excluding_backoff += this_total_count
-                if self.backoff_symbol in word_to_count:
-                    total_excluding_backoff -= word_to_count[self.backoff_symbol]
-        print('total count = {0}, excluding discount = {1}'.format(
-                total, total_excluding_backoff), file = sys.stderr)
-
-    def AddTopWords(self, top_words_file):
-        empty_history = ()
-        word_to_count = self.counts[0][empty_history]
-        total = sum(word_to_count.values())
-        try:
-            f = open(top_words_file)
-        except:
-            sys.exit("make_one_biased_lm.py: error opening top-words file: "
-                     "--top-words=" + top_words_file)
-        while True:
-            line = f.readline()
-            if line == '':
-                break
-            try:
-                [ word_index, prob ] = line.split()
-                word_index = int(word_index)
-                prob = float(prob)
-                assert word_index > 0 and prob > 0.0
-                word_to_count[word_index] += prob * total
-            except Exception as e:
-                sys.exit("make_one_biased_lm.py: could not make sense of the "
-                         "line '{0}' in top-words file: {1} ".format(line, str(e)))
-        f.close()
-
-
-    def GetTotalCountMap(self):
-        # This function, called from PrintAsFst, returns a map from
-        # history to the total-count for that state.
-        total_count_map = dict()
-        for n in range(0, self.ngram_order):
-            for hist, word_to_count in self.counts[n].items():
-                total_count_map[hist] = sum(word_to_count.values())
-        return total_count_map
-
-    def GetHistToStateMap(self):
-        # This function, called from PrintAsFst, returns a map from
-        # history to integer FST-state.
-        hist_to_state = dict()
-        fst_state_counter = 0
-        for n in range(0, self.ngram_order):
-            for hist in self.counts[n].keys():
-                hist_to_state[hist] = fst_state_counter
-                fst_state_counter += 1
-        return hist_to_state
-
-    def GetProb(self, hist, word, total_count_map):
-        total_count = total_count_map[hist]
-        word_to_count = self.counts[len(hist)][hist]
-        prob = word_to_count[word] / total_count
-        if len(hist) > 0 and word != self.backoff_symbol:
-            prob_in_backoff = self.GetProb(hist[1:], word, total_count_map)
-            backoff_prob = word_to_count[self.backoff_symbol] / total_count
-            prob += backoff_prob * prob_in_backoff
-        return prob
-
-    # This function prints the estimated language model as an FST.
-    def PrintAsFst(self, word_disambig_symbol):
-        # n is the history-length (== n-gram order minus one).  We iterate over the
-        # history-length in the order 1, 0, 2, 3, and then iterate over the
-        # histories of each order in sorted order.  Putting order 1 first
-        # and sorting on the histories
-        # ensures that the bigram state with <s> as the left context comes first.
-        # (note: self.bos_symbol is the most negative symbol)
-
-        # hist_to_state will map from history (as a tuple) to integer FST-state.
-        hist_to_state = self.GetHistToStateMap()
-        total_count_map = self.GetTotalCountMap()
-
-        for n in [ 1, 0 ] + range(2, self.ngram_order):
-            this_order_counts = self.counts[n]
-            # For order 1, make sure the keys are sorted.
-            keys = this_order_counts.keys() if n != 1 else sorted(this_order_counts.keys())
-            for hist in keys:
-                word_to_count = this_order_counts[hist]
-                this_fst_state = hist_to_state[hist]
-
-                for word in word_to_count.keys():
-                    # work out this_cost.  Costs in OpenFst are negative logs.
-                    this_cost = -math.log(self.GetProb(hist, word, total_count_map))
-
-                    if word > 0:  # a real word.
-                        next_hist = hist + (word,)  # appending tuples
-                        while not next_hist in hist_to_state:
-                            next_hist = next_hist[1:]
-                        next_fst_state = hist_to_state[next_hist]
-                        print(this_fst_state, next_fst_state, word, word,
-                              this_cost)
-                    elif word == self.eos_symbol:
-                        # print final-prob for this state.
-                        print(this_fst_state, this_cost)
-                    else:
-                        assert word == self.backoff_symbol
-                        backoff_fst_state = hist_to_state[hist[1:len(hist)]]
-                        print(this_fst_state, backoff_fst_state,
-                              word_disambig_symbol, 0, this_cost)
-
-
-ngram_counts = NgramCounts(args.ngram_order)
-ngram_counts.AddRawCountsFromStandardInput()
-
-if args.verbose >= 3:
-    ngram_counts.Print("Raw counts:")
-ngram_counts.CompletelyDiscountLowCountStates(args.min_lm_state_count)
-if args.verbose >= 3:
-    ngram_counts.Print("Counts after discounting low-count states:")
-ngram_counts.ApplyBackoff(args.discounting_constant)
-if args.verbose >= 3:
-    ngram_counts.Print("Counts after applying Kneser-Ney discounting:")
-if args.top_words != None:
-    ngram_counts.AddTopWords(args.top_words)
-    if args.verbose >= 3:
-        ngram_counts.Print("Counts after applying top-n-words")
-ngram_counts.PrintAsFst(args.word_disambig_symbol)
-
-
-# test command:
-# (echo 6 7 8 4; echo 7 8 9; echo 7 8) | ./make_one_biased_lm.py --word-disambig-symbol=1000 --min-lm-state-count=2 --verbose=3 --top-words=<(echo 1 0.5; echo 2 0.25)
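For orientation before the next deleted script: the probability that GetProb assigns after the Kneser-Ney backoff above interpolates the discounted direct count with the backoff state's estimate. A standalone sketch with invented numbers (it mirrors GetProb rather than importing the script):

    def interp_prob(count, total, backoff_mass, backoff_prob):
        # p(word | hist) = count/total + (backoff_mass/total) * p(word | shorter hist)
        return count / total + (backoff_mass / total) * backoff_prob

    # e.g. count(word)=0.7 after discounting D=0.3, state total 2.0,
    # accumulated backoff mass 0.3, backoff-state probability 0.25:
    assert abs(interp_prob(0.7, 2.0, 0.3, 0.25) - 0.3875) < 1e-12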
diff --git a/egs/wsj/s5/steps/cleanup/modify_ctm_edits.py b/egs/wsj/s5/steps/cleanup/modify_ctm_edits.py
deleted file mode 100755
index 1022196a456..00000000000
--- a/egs/wsj/s5/steps/cleanup/modify_ctm_edits.py
+++ /dev/null
@@ -1,428 +0,0 @@
-#!/usr/bin/env python
-
-# Copyright 2016  Vimal Manohar
-#           2016  Johns Hopkins University (author: Daniel Povey)
-# Apache 2.0
-
-from __future__ import print_function
-import sys, operator, argparse, os
-from collections import defaultdict
-
-# This script reads and writes the 'ctm-edits' file that is
-# produced by get_ctm_edits.py.
-
-# It modifies the ctm-edits so that non-scored words
-# are not counted as errors: for instance, if there are things like
-# [COUGH] and [NOISE] in the transcript, deletions, insertions and
-# substitutions involving them are allowed, and we modify the reference
-# to correspond to the hypothesis.
-#
-# If you supply the <lang> directory (the one that corresponds to
-# how you decoded the data) to this script, it assumes that the <lang>
-# directory contains phones/align_lexicon.int, and it uses this to work
-# out a reasonable guess of the non-scored phones, based on which have
-# a single-word pronunciation that maps to a silence phone.
-# It then uses the words.txt to work out the written form of those words.
-#
-# Alternatively, you may specify a file containing the non-scored words one
-# per line, with the --non-scored-words option.
-#
-# Non-scored words that were deleted (i.e. they were in the ref but not the
-# hyp) are simply removed from the ctm.  For non-scored words that
-# were inserted or substituted, we change the reference word to match the
-# hyp word, but instead of marking the operation as 'cor' (correct), we
-# mark it as 'fix' (fixed), so that it will not be positively counted as a correct
-# word for purposes of finding the optimal segment boundaries.
-#
-# e.g.
-#
-# [note: the <channel> will always be 1].
-#
-# AJJacobs_2007P-0001605-0003029 1 0 0.09 <eps> 1.0 <eps> sil
-# AJJacobs_2007P-0001605-0003029 1 0.09 0.15 i 1.0 i cor
-# AJJacobs_2007P-0001605-0003029 1 0.24 0.25 thought 1.0 thought cor
-# AJJacobs_2007P-0001605-0003029 1 0.49 0.14 i'd 1.0 i'd cor
-# AJJacobs_2007P-0001605-0003029 1 0.63 0.22 tell 1.0 tell cor
-# AJJacobs_2007P-0001605-0003029 1 0.85 0.11 you 1.0 you cor
-# AJJacobs_2007P-0001605-0003029 1 0.96 0.05 a 1.0 a cor
-# AJJacobs_2007P-0001605-0003029 1 1.01 0.24 little 1.0 little cor
-# AJJacobs_2007P-0001605-0003029 1 1.25 0.5 about 1.0 about cor
-# AJJacobs_2007P-0001605-0003029 1 1.75 0.48 [UH] 1.0 [UH] cor
-
-
-parser = argparse.ArgumentParser(
-    description = "This program modifies the reference in the ctm-edits which "
-    "is output by steps/cleanup/get_ctm_edits.py, to allow insertions, deletions and "
-    "substitutions of non-scored words, and [if --allow-repetitions=true], "
-    "duplications of single words or pairs of scored words (to account for dysfluencies "
-    "that were not transcribed).  Note: deletions and substitutions of non-scored words "
-    "after the reference is corrected, will be marked as operation 'fix' rather than "
-    "'cor' (correct) so that the downstream processing knows that this was not in "
-    "the original reference.  Also by default tags non-scored words as such when "
-    "they are correct; see the --tag-non-scored option.")
-
-parser.add_argument("--verbose", type = int, default = 1,
-                    choices=[0,1,2,3],
-                    help = "Verbose level, higher = more verbose output")
-parser.add_argument("--allow-repetitions", type = str, default = 'true',
-                    choices=['true','false'],
-                    help = "If true, allow repetitions in the transcript of one- or "
-                    "two-word sequences: for instance if the ref says 'i' but "
-                    "the hyp says 'i i', or the ref says 'but then' and the hyp says "
-                    "'but then but then', fix the reference accordingly.  Intervening "
-                    "non-scored words are allowed between the repetitions.  These "
-                    "fixes will be marked as 'cor', not as 'fix', since there is "
-                    "generally no way to tell which repetition was the 'real' one "
-                    "(and since we're generally confident that such things were "
-                    "actually uttered).")
-parser.add_argument("non_scored_words_in", metavar = "<non-scored-words-file>",
-                    help = "Filename of file containing a list of non-scored words, "
-                    "one per line.  See steps/cleanup/get_non_scored_words.py.")
-parser.add_argument("ctm_edits_in", metavar = "<ctm-edits-in>",
-                    help = "Filename of input ctm-edits file. "
-                    "Use /dev/stdin for standard input.")
-parser.add_argument("ctm_edits_out", metavar = "<ctm-edits-out>",
-                    help = "Filename of output ctm-edits file. "
-                    "Use /dev/stdout for standard output.")
-
-args = parser.parse_args()
" - "Use /dev/stdout for standard output.") - -args = parser.parse_args() - - - -def ReadNonScoredWords(non_scored_words_file): - global non_scored_words - try: - f = open(non_scored_words_file) - except: - sys.exit("modify_ctm_edits.py: error opening file: " - "--non-scored-words=" + non_scored_words_file) - for line in f.readlines(): - a = line.split() - if not len(line.split()) == 1: - sys.exit("modify_ctm_edits.py: bad line in non-scored-words " - "file {0}: {1}".format(non_scored_words_file, line)) - non_scored_words.add(a[0]) - f.close() - - - -# The ctm-edits file format is as follows [note: file-id is really utterance-id -# in this context]. -# -# e.g.: -# AJJacobs_2007P-0001605-0003029 1 0 0.09 1.0 sil -# AJJacobs_2007P-0001605-0003029 1 0.09 0.15 i 1.0 i cor -# ... -# This function processes a single line of ctm-edits input for fixing -# "non-scored" words. The input 'a' is the split line as an array of fields. -# It modifies the object 'a'. This function returns the modified array, -# and please note that it is destructive of its input 'a'. -# If it returnso the empty array then the line is to be deleted. -def ProcessLineForNonScoredWords(a): - global num_lines, num_correct_lines, ref_change_stats - try: - assert len(a) == 8 - num_lines += 1 - # we could do: - # [ file, channel, start, duration, hyp_word, confidence, ref_word, edit_type ] = a - duration = a[3] - hyp_word = a[4] - ref_word = a[6] - edit_type = a[7] - if edit_type == 'ins': - assert ref_word == '' - if hyp_word in non_scored_words: - # insert this non-scored word into the reference. - ref_change_stats[ref_word + ' -> ' + hyp_word] += 1 - ref_word = hyp_word - edit_type = 'fix' - elif edit_type == 'del': - assert hyp_word == '' and float(duration) == 0.0 - if ref_word in non_scored_words: - ref_change_stats[ref_word + ' -> ' + hyp_word] += 1 - return [] - elif edit_type == 'sub': - if hyp_word in non_scored_words and ref_word in non_scored_words: - # we also allow replacing one non-scored word with another. - ref_change_stats[ref_word + ' -> ' + hyp_word] += 1 - ref_word = hyp_word - edit_type = 'fix' - else: - assert edit_type == 'cor' or edit_type == 'sil' - num_correct_lines += 1 - - a[4] = hyp_word - a[6] = ref_word - a[7] = edit_type - return a - - except Exception as e: - print("modify_ctm_edits.py: bad line in ctm-edits input: " + ' '.join(a), - file = sys.stderr) - print("modify_ctm_edits.py: exception was: " + str(e), - file = sys.stderr) - sys.exit(1) - -# This function processes the split lines of one utterance (as a -# list of lists of fields), to allow repetitions of words, so if the -# reference says 'i' but the hyp says 'i i', or the ref says -# 'you know' and the hyp says 'you know you know', we change the -# ref to match. -# It returns the modified list-of-lists [but note that the input -# is actually modified]. -def ProcessUtteranceForRepetitions(split_lines_of_utt): - global non_scored_words, repetition_stats - # The array 'selected_lines' will contain the indexes of of selected - # elements of 'split_lines_of_utt'. Consider split_line = - # split_lines_of_utt[i]. If the hyp and ref words in split_line are both - # either '' or non-scoreable words, we discard the index. - # Otherwise we put it into selected_lines. - selected_line_indexes = [] - # selected_edits will contain, for each element of selected_line_indexes, the - # corresponding edit_type from the original utterance previous to - # this function call ('cor', 'ins', etc.). 
-
-# This function processes the split lines of one utterance (as a
-# list of lists of fields), to allow repetitions of words, so if the
-# reference says 'i' but the hyp says 'i i', or the ref says
-# 'you know' and the hyp says 'you know you know', we change the
-# ref to match.
-# It returns the modified list-of-lists [but note that the input
-# is actually modified].
-def ProcessUtteranceForRepetitions(split_lines_of_utt):
-    global non_scored_words, repetition_stats
-    # The array 'selected_line_indexes' will contain the indexes of selected
-    # elements of 'split_lines_of_utt'.  Consider split_line =
-    # split_lines_of_utt[i].  If the hyp and ref words in split_line are both
-    # either '<eps>' or non-scoreable words, we discard the index.
-    # Otherwise we put it into selected_line_indexes.
-    selected_line_indexes = []
-    # selected_edits will contain, for each element of selected_line_indexes, the
-    # corresponding edit_type from the original utterance previous to
-    # this function call ('cor', 'ins', etc.).
-    #
-    # As a special case, if there was a substitution ('sub') where the
-    # reference word was a non-scored word and the hyp word was a real word,
-    # we mark it in this array as 'ins', because for purposes of this algorithm
-    # it behaves the same as an insertion.
-    #
-    # Whenever we do any operation that will change the reference, we change
-    # all the selected_edits in the array to None so that they won't match
-    # any further operations.
-    selected_edits = []
-    # selected_hyp_words will contain, for each element of selected_line_indexes, the
-    # corresponding hyp_word.
-    selected_hyp_words = []
-
-    for i in range(len(split_lines_of_utt)):
-        split_line = split_lines_of_utt[i]
-        hyp_word = split_line[4]
-        ref_word = split_line[6]
-        # keep_this_line will be True if we are going to keep this line in the
-        # 'selected lines' for further processing of repetitions.  We only
-        # eliminate lines involving non-scored words or epsilon in both hyp
-        # and reference position
-        # [note: epsilon in hyp position for non-empty segments indicates
-        # optional-silence, and it does make sense to make this 'invisible',
-        # just like non-scored words, for the purposes of this code.]
-        keep_this_line = True
-        if (hyp_word == '<eps>' or hyp_word in non_scored_words) and \
-           (ref_word == '<eps>' or ref_word in non_scored_words):
-            keep_this_line = False
-        if keep_this_line:
-            selected_line_indexes.append(i)
-            edit_type = split_line[7]
-            if edit_type == 'sub' and ref_word in non_scored_words:
-                assert not hyp_word in non_scored_words
-                # For purposes of this algorithm, substitution of, say,
-                # '[COUGH]' by 'hello' behaves like an insertion of 'hello',
-                # since we're willing to remove the '[COUGH]' from the
-                # transcript.
-                edit_type = 'ins'
-            selected_edits.append(edit_type)
-            selected_hyp_words.append(hyp_word)
-
-    # indexes_to_fix will be a list of indexes into 'selected_indexes' where we
-    # plan to fix the ref to match the hyp.
-    indexes_to_fix = []
-
-    # This loop scans for, and fixes, two-word insertions that follow,
-    # or precede, the corresponding correct words.
-    for i in range(0, len(selected_line_indexes) - 3):
-        this_indexes = selected_line_indexes[i:i+4]
-        this_hyp_words = selected_hyp_words[i:i+4]
-
-        if this_hyp_words[0] == this_hyp_words[2] and \
-           this_hyp_words[1] == this_hyp_words[3] and \
-           this_hyp_words[0] != this_hyp_words[1]:
-            # if the hyp words were of the form [ 'a', 'b', 'a', 'b' ]...
-            this_edits = selected_edits[i:i+4]
-            if this_edits == [ 'cor', 'cor', 'ins', 'ins' ] or \
-               this_edits == [ 'ins', 'ins', 'cor', 'cor' ]:
-                if this_edits[0] == 'cor':
-                    indexes_to_fix += [ i+2, i+3 ]
-                else:
-                    indexes_to_fix += [ i, i+1 ]
-                word_pair = this_hyp_words[0] + ' ' + this_hyp_words[1]
-                # e.g. word_pair = 'hi there'
-                # add 2 because these stats are of words.
-                repetition_stats[word_pair] += 2
-                # the next line prevents this region of the text being used
-                # in any further edits.
-                selected_edits[i:i+4] = [ None, None, None, None ]
-
-    # This loop scans for, and fixes, one-word insertions that follow,
-    # or precede, the corresponding correct words.
-    for i in range(0, len(selected_line_indexes) - 1):
-        this_indexes = selected_line_indexes[i:i+2]
-        this_hyp_words = selected_hyp_words[i:i+2]
-
-        if this_hyp_words[0] == this_hyp_words[1]:
-            # if the hyp words were of the form [ 'a', 'a' ]...
-            this_edits = selected_edits[i:i+2]
-            if this_edits == [ 'cor', 'ins' ] or this_edits == [ 'ins', 'cor' ]:
-                if this_edits[0] == 'cor':
-                    indexes_to_fix.append(i+1)
-                else:
-                    indexes_to_fix.append(i)
-                repetition_stats[this_hyp_words[0]] += 1
-                # the next line prevents this region of the text being used
-                # in any further edits.
-                selected_edits[i:i+2] = [ None, None ]
-
-    for i in indexes_to_fix:
-        j = selected_line_indexes[i]
-        split_line = split_lines_of_utt[j]
-        ref_word = split_line[6]
-        hyp_word = split_line[4]
-        assert ref_word == '<eps>' or ref_word in non_scored_words
-        # we replace the reference with the decoded word, which will be a
-        # repetition.
-        split_line[6] = hyp_word
-        split_line[7] = 'cor'
-
-    return split_lines_of_utt
-
-
-# note: split_lines_of_utt is a list of lists, one per line, each containing the
-# sequence of fields.
-# Returns the same format of data after processing.
-def ProcessUtterance(split_lines_of_utt):
-    new_split_lines_of_utt = []
-    for split_line in split_lines_of_utt:
-        new_split_line = ProcessLineForNonScoredWords(split_line)
-        if new_split_line != []:
-            new_split_lines_of_utt.append(new_split_line)
-    if args.allow_repetitions == 'true':
-        new_split_lines_of_utt = ProcessUtteranceForRepetitions(new_split_lines_of_utt)
-    return new_split_lines_of_utt
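The pattern the two repetition loops above look for can be summarized in a few lines (standalone sketch; example words invented):

    # A two-word repetition: hyp 'you know you know' against ref 'you know',
    # where the aligner marked the second pair as insertions.
    hyp_words = ['you', 'know', 'you', 'know']
    edits = ['cor', 'cor', 'ins', 'ins']
    is_repetition = (hyp_words[0:2] == hyp_words[2:4]
                     and hyp_words[0] != hyp_words[1]
                     and edits in (['cor', 'cor', 'ins', 'ins'],
                                   ['ins', 'ins', 'cor', 'cor']))
    assert is_repetition  # the 'ins' pair's ref words would be fixed to match the hyp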
-
-
-def ProcessData():
-    try:
-        f_in = open(args.ctm_edits_in)
-    except:
-        sys.exit("modify_ctm_edits.py: error opening ctm-edits input "
-                 "file {0}".format(args.ctm_edits_in))
-    try:
-        f_out = open(args.ctm_edits_out, 'w')
-    except:
-        sys.exit("modify_ctm_edits.py: error opening ctm-edits output "
-                 "file {0}".format(args.ctm_edits_out))
-    num_lines_processed = 0
-
-    # Most of what we're doing in the lines below is splitting the input lines
-    # and grouping them per utterance, before giving them to ProcessUtterance()
-    # and then printing the modified lines.
-    first_line = f_in.readline()
-    if first_line == '':
-        sys.exit("modify_ctm_edits.py: empty input")
-    split_pending_line = first_line.split()
-    if len(split_pending_line) == 0:
-        sys.exit("modify_ctm_edits.py: bad input line " + first_line)
-    cur_utterance = split_pending_line[0]
-    split_lines_of_cur_utterance = []
-
-    while True:
-        if len(split_pending_line) == 0 or split_pending_line[0] != cur_utterance:
-            split_lines_of_cur_utterance = ProcessUtterance(split_lines_of_cur_utterance)
-            for split_line in split_lines_of_cur_utterance:
-                print(' '.join(split_line), file = f_out)
-            split_lines_of_cur_utterance = []
-            if len(split_pending_line) == 0:
-                break
-            else:
-                cur_utterance = split_pending_line[0]
-
-        split_lines_of_cur_utterance.append(split_pending_line)
-        next_line = f_in.readline()
-        split_pending_line = next_line.split()
-        if len(split_pending_line) == 0:
-            if next_line != '':
-                sys.exit("modify_ctm_edits.py: got an empty or whitespace input line")
-    try:
-        f_out.close()
-    except:
-        sys.exit("modify_ctm_edits.py: error closing ctm-edits output "
-                 "(broken pipe or full disk?)")
-
-def PrintNonScoredStats():
-    if args.verbose < 1:
-        return
-    if num_lines == 0:
-        print("modify_ctm_edits.py: processed no input.", file = sys.stderr)
-    num_lines_modified = sum(ref_change_stats.values())
-    num_incorrect_lines = num_lines - num_correct_lines
-    percent_lines_incorrect = '%.2f' % (num_incorrect_lines * 100.0 / num_lines)
-    percent_modified = '%.2f' % (num_lines_modified * 100.0 / num_lines)
-    percent_of_incorrect_modified = '%.2f' % (num_lines_modified * 100.0 / num_incorrect_lines)
-    print("modify_ctm_edits.py: processed {0} lines of ctm ({1}% of which incorrect), "
-          "of which {2} were changed fixing the reference for non-scored words "
-          "({3}% of lines, or {4}% of incorrect lines)".format(
-              num_lines, percent_lines_incorrect, num_lines_modified,
-              percent_modified, percent_of_incorrect_modified),
-          file = sys.stderr)
-
-    keys = sorted(ref_change_stats.keys(), reverse=True,
-                  key = lambda x: ref_change_stats[x])
-    num_keys_to_print = 40 if args.verbose >= 2 else 10
-
-    print("modify_ctm_edits.py: most common edits (as percentages "
-          "of all such edits) are:\n" +
-          ('\n'.join([ '%s [%.2f%%]' % (k, ref_change_stats[k]*100.0/num_lines_modified)
-                       for k in keys[0:num_keys_to_print]]))
-          + ('\n...' if num_keys_to_print < len(keys) else ''),
-          file = sys.stderr)
-
-
-def PrintRepetitionStats():
-    if args.verbose < 1 or sum(repetition_stats.values()) == 0:
-        return
-    num_lines_modified = sum(repetition_stats.values())
-    num_incorrect_lines = num_lines - num_correct_lines
-    percent_lines_incorrect = '%.2f' % (num_incorrect_lines * 100.0 / num_lines)
-    percent_modified = '%.2f' % (num_lines_modified * 100.0 / num_lines)
-    percent_of_incorrect_modified = '%.2f' % (num_lines_modified * 100.0 / num_incorrect_lines)
-    print("modify_ctm_edits.py: processed {0} lines of ctm ({1}% of which incorrect), "
-          "of which {2} were changed fixing the reference for repetitions ({3}% of "
-          "lines, or {4}% of incorrect lines)".format(
-              num_lines, percent_lines_incorrect, num_lines_modified,
-              percent_modified, percent_of_incorrect_modified),
-          file = sys.stderr)
-
-    keys = sorted(repetition_stats.keys(), reverse=True,
-                  key = lambda x: repetition_stats[x])
-    num_keys_to_print = 40 if args.verbose >= 2 else 10
-
-    print("modify_ctm_edits.py: most common repetitions inserted into reference (as percentages "
-          "of all words fixed in this way) are:\n" +
-          ('\n'.join([ '%s [%.2f%%]' % (k,
repetition_stats[k]*100.0/num_lines_modified) - for k in keys[0:num_keys_to_print]])) - + '\n...' if num_keys_to_print < len(keys) else '', - file = sys.stderr) - - -non_scored_words = set() -ReadNonScoredWords(args.non_scored_words_in) - -num_lines = 0 -num_correct_lines = 0 -# ref_change_stats will be a map from a string like -# 'foo -> bar' to an integer count; it keeps track of how much we changed -# the reference. -ref_change_stats = defaultdict(int) -# repetition_stats will be a map from strings like -# 'a', or 'a b' (the repeated strings), to an integer count; like -# ref_change_stats, it keeps track of how many changes we made -# in allowing repetitions. -repetition_stats = defaultdict(int) - -ProcessData() -PrintNonScoredStats() -PrintRepetitionStats() diff --git a/egs/wsj/s5/steps/cleanup/segment_ctm_edits.py b/egs/wsj/s5/steps/cleanup/segment_ctm_edits.py deleted file mode 100755 index 7e635d66169..00000000000 --- a/egs/wsj/s5/steps/cleanup/segment_ctm_edits.py +++ /dev/null @@ -1,1034 +0,0 @@ -#!/usr/bin/env python - -# Copyright 2016 Vimal Manohar -# 2016 Johns Hopkins University (author: Daniel Povey) -# Apache 2.0 - -from __future__ import print_function -import sys, operator, argparse, os -from collections import defaultdict - -# This script reads the 'ctm-edits' file format that is produced by get_ctm_edits.py -# and modified by modify_ctm_edits.py and taint_ctm_edits.py. Its function is to -# produce a segmentation and text from the ctm-edits input. - -# The ctm-edits file format that this script expects is as follows: -# <file-id> <channel> <start-time> <duration> <hyp-word> <confidence> <ref-word> <edit-type> ['tainted'] -# [note: file-id is really utterance-id at this point]. - -parser = argparse.ArgumentParser( - description = "This program produces segmentation and text information " - "based on reading ctm-edits input format which is produced by " - "steps/cleanup/get_ctm_edits.py, steps/cleanup/modify_ctm_edits.py and " - "steps/cleanup/taint_ctm_edits.py.") - -parser.add_argument("--min-segment-length", type = float, default = 0.5, - help = "Minimum allowed segment length (in seconds) for any " - "segment; shorter segments than this will be discarded.") -parser.add_argument("--min-new-segment-length", type = float, default = 1.0, - help = "Minimum allowed segment length (in seconds) for newly " - "created segments (i.e. not identical to the input utterances). " - "Expected to be >= --min-segment-length.") -parser.add_argument("--frame-length", type = float, default = 0.01, - help = "This only affects rounding of the output times; they will " - "be constrained to multiples of this value.") -parser.add_argument("--max-tainted-length", type = float, default = 0.05, - help = "Maximum allowed length of any 'tainted' line. Note: " - "'tainted' lines may only appear at the boundary of a " - "segment.") -parser.add_argument("--max-edge-silence-length", type = float, default = 0.5, - help = "Maximum allowed length of silence if it appears at the " - "edge of a segment (will be truncated). This rule is " - "relaxed if such truncation would take a segment below " - "the --min-segment-length or --min-new-segment-length.") -parser.add_argument("--max-edge-non-scored-length", type = float, default = 0.5, - help = "Maximum allowed length of a non-scored word (noise, cough, etc.) " - "if it appears at the edge of a segment (will be truncated).
" - "This rule is relaxed if such truncation would take a " - "segment below the --min-segment-length.") -parser.add_argument("--max-internal-silence-length", type = float, default = 2.0, - help = "Maximum allowed length of silence if it appears inside a segment " - "(will cause the segment to be split).") -parser.add_argument("--max-internal-non-scored-length", type = float, default = 2.0, - help = "Maximum allowed length of a non-scored word (noise, etc.) if " - "it appears inside a segment (will cause the segment to be " - "split). Note: reference words which are real words but OOV " - "are not included in this category.") -parser.add_argument("--unk-padding", type = float, default = 0.05, - help = "Amount of padding with that we do if a segment boundary is " - "next to errors (ins, del, sub). That is, we add this amount of " - "time to the segment and add the word to cover the acoustics. " - "If nonzero, the --oov-symbol-file option must be supplied.") -parser.add_argument("--max-junk-proportion", type = float, default = 0.1, - help = "Maximum proportion of the time of the segment that may " - "consist of potentially bad data, in which we include 'tainted' lines of " - "the ctm-edits input and unk-padding.") -parser.add_argument("--max-deleted-words-kept-when-merging", type = str, default = 1, - help = "When merging segments that are found to be overlapping or " - "adjacent after all other processing, keep in the transcript the " - "reference words that were deleted between the segments [if any] " - "as long as there were no more than this many reference words. " - "Setting this to zero will mean that any reference words that " - "were deleted between the segments we're about to reattach will " - "not appear in the generated transcript (so we'll match the hyp).") -parser.add_argument("--oov-symbol-file", type = str, default = None, - help = "Filename of file such as data/lang/oov.txt which contains " - "the text form of the OOV word, normally ''. Supplied as " - "a file to avoid complications with escaping. Necessary if " - "the --unk-padding option has a nonzero value (which it does " - "by default.") -parser.add_argument("--ctm-edits-out", type = str, - help = "Filename to output an extended version of the ctm-edits format " - "with segment start and end points noted. This file is intended to be " - "read by humans; there are currently no scripts that will read it.") -parser.add_argument("--word-stats-out", type = str, - help = "Filename for output of word-level stats, of the form " - "' ', e.g. 'hello 0.12 12408', " - "where the is the proportion of the time that this " - "reference word does not make it into a segment. It can help reveal words " - "that have problematic pronunciations or are associated with " - "transcription errors.") - - -parser.add_argument("non_scored_words_in", metavar = "", - help="Filename of file containing a list of non-scored words, " - "one per line. See steps/cleanup/get_nonscored_words.py.") -parser.add_argument("ctm_edits_in", metavar = "", - help = "Filename of input ctm-edits file. " - "Use /dev/stdin for standard input.") -parser.add_argument("text_out", metavar = "", - help = "Filename of output text file (same format as data/train/text, i.e. " - " ... ") -parser.add_argument("segments_out", metavar = "", - help = "Filename of output segments. 
This has the same format as data/train/segments, " - "but instead of <recording-id>, the second field is the old utterance-id, i.e. " - "<new-utterance-id> <old-utterance-id> <start-time> <end-time>") - -args = parser.parse_args() - - - - -def IsTainted(split_line_of_utt): - return len(split_line_of_utt) > 8 and split_line_of_utt[8] == 'tainted' - -# This function returns a list of pairs (start-index, end-index) representing -# the cores of segments (so if a pair is (s, e), then the core of a segment -# would span (s, s+1, ..., e-1)). -# -# By the 'core of a segment', we mean a sequence of ctm-edits lines including at -# least one 'cor' line and a contiguous sequence of other lines of the type -# 'cor', 'fix' and 'sil' that must not be tainted. The segment core excludes -# any tainted lines at the edge of a segment, which will be added later. -# -# We only initiate a segment when it contains something correct and not realized -# as unk (i.e. ref==hyp); and we extend it with anything that is 'sil' or 'fix' -# or 'cor' that is not tainted. Contiguous regions of 'true' in the resulting -# boolean array will then become the cores of prototype segments, and we'll add -# any adjacent tainted words (or parts of them). -def ComputeSegmentCores(split_lines_of_utt): - num_lines = len(split_lines_of_utt) - line_is_in_segment_core = [ False ] * num_lines - for i in range(num_lines): - if split_lines_of_utt[i][7] == 'cor' and \ - split_lines_of_utt[i][4] == split_lines_of_utt[i][6]: - line_is_in_segment_core[i] = True - - # extend each proto-segment forwards as far as we can: - for i in range(1, num_lines): - if line_is_in_segment_core[i-1] and not line_is_in_segment_core[i]: - edit_type = split_lines_of_utt[i][7] - if not IsTainted(split_lines_of_utt[i]) and \ - (edit_type == 'cor' or edit_type == 'sil' or edit_type == 'fix'): - line_is_in_segment_core[i] = True - - # extend each proto-segment backwards as far as we can: - for i in reversed(range(0, num_lines - 1)): - if line_is_in_segment_core[i+1] and not line_is_in_segment_core[i]: - edit_type = split_lines_of_utt[i][7] - if not IsTainted(split_lines_of_utt[i]) and \ - (edit_type == 'cor' or edit_type == 'sil' or edit_type == 'fix'): - line_is_in_segment_core[i] = True - - - segment_ranges = [] - cur_segment_start = None - for i in range(0, num_lines): - if line_is_in_segment_core[i]: - if cur_segment_start == None: - cur_segment_start = i - else: - if cur_segment_start != None: - segment_ranges.append( (cur_segment_start, i) ) - cur_segment_start = None - if cur_segment_start != None: - segment_ranges.append( (cur_segment_start, num_lines) ) - - return segment_ranges - -class Segment: - def __init__(self, split_lines_of_utt, start_index, end_index, debug_str = None): - self.split_lines_of_utt = split_lines_of_utt - # start_index is the index of the first line that appears in this - # segment, and end_index is one past the last line. This does not - # include unk-padding. - self.start_index = start_index - self.end_index = end_index - # If the following values are nonzero, then when we create the segment - # we will add <unk> at the start and end of the segment [representing - # partial words], with this amount of additional audio. - self.start_unk_padding = 0.0 - self.end_unk_padding = 0.0 - - # debug_str keeps track of the 'core' of the segment. - if debug_str == None: - debug_str = 'core-start={0},core-end={1}'.format(start_index,end_index) - self.debug_str = debug_str - - # This gives the proportion of the time of the first line in the segment - # that we keep.
Usually 1.0 but may be less if we've trimmed away some - # proportion of the time. - self.start_keep_proportion = 1.0 - # This gives the proportion of the time of the last line in the segment - # that we keep. Usually 1.0 but may be less if we've trimmed away some - # proportion of the time. - self.end_keep_proportion = 1.0 - - # This is stage 1 of segment processing (after creating the boundaries of the - # core of the segment, which is done outside of this class). - # - # This function may reduce start_index and/or increase end_index by - # including a single adjacent 'tainted' line from the ctm-edits file. This - # is only done if the lines at the boundaries of the segment are currently - # real non-silence words and not non-scored words. The idea is that we - # probably don't want to start or end the segment right at the boundary of a - # real word, we want to add some kind of padding. - def PossiblyAddTaintedLines(self): - global non_scored_words - split_lines_of_utt = self.split_lines_of_utt - # we're iterating over the two boundaries of the segment (start and end) - for b in [False, True]: - if b: - boundary_index = self.end_index - 1 - adjacent_index = self.end_index - else: - boundary_index = self.start_index - adjacent_index = self.start_index - 1 - if adjacent_index >= 0 and adjacent_index < len(split_lines_of_utt): - # only consider merging the adjacent word into the segment if we're not - # at a segment boundary. - adjacent_line_is_tainted = IsTainted(split_lines_of_utt[adjacent_index]) - # if the adjacent line wasn't tainted, then there must have been - # another stronger reason why we didn't include it in the core - # of the segment (probably that it was an ins, del or sub), so - # there is no point considering it. - if adjacent_line_is_tainted: - boundary_edit_type = split_lines_of_utt[boundary_index][7] - boundary_hyp_word = split_lines_of_utt[boundary_index][4] - # we only add the tainted line to the segment if the word at - # the boundary was a non-silence word that was correctly - # decoded and not fixed [see modify_ctm_edits.py.] - if boundary_edit_type == 'cor' and \ - not boundary_hyp_word in non_scored_words: - # Add the adjacent tainted line to the segment. - if b: - self.end_index += 1 - else: - self.start_index -= 1 - - # This is stage 2 of segment processing. - # This function will split a segment into multiple pieces if any of the - # internal [non-boundary] silences or non-scored words are longer - # than the allowed values --max-internal-silence-length and - # --max-internal-non-scored-length. This function returns a - # list of segments. In the normal case (where there is no splitting) - # it just returns an array with a single element 'self'. - def PossiblySplitSegment(self): - global non_scored_words, args - # make sure the segment hasn't been processed more than we expect. - assert self.start_unk_padding == 0.0 and self.end_unk_padding == 0.0 and \ - self.start_keep_proportion == 1.0 and self.end_keep_proportion == 1.0 - segments = [] # the answer - cur_start_index = self.start_index - cur_start_is_split = False - # only consider splitting at non-boundary lines. [we'd just truncate - # the boundary lines.]
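# [Editor's aside -- illustrative sketch, not part of the original script.]
# The loop below implements the stage-2 split rule: split at any internal
# line that is a silence longer than --max-internal-silence-length, or a
# non-scored word longer than --max-internal-non-scored-length. A toy
# version of just the decision (the default thresholds of 2.0 are assumed):
def should_split_at(split_line, non_scored_words,
                    max_internal_silence=2.0, max_internal_non_scored=2.0):
    duration = float(split_line[3])
    ref_word, edit_type = split_line[6], split_line[7]
    if edit_type == 'sil' and duration > max_internal_silence:
        return True
    return ref_word in non_scored_words and duration > max_internal_non_scored

# e.g. a 2.5-second internal silence triggers a split:
# should_split_at(['utt', '1', '4.0', '2.5', '<eps>', '1.0', '<eps>', 'sil'], set())
# -> True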
- for index_to_split_at in range(cur_start_index + 1, self.end_index - 1): - this_split_line = self.split_lines_of_utt[index_to_split_at] - this_duration = float(this_split_line[3]) - this_edit_type = this_split_line[7] - this_ref_word = this_split_line[6] - if (this_edit_type == 'sil' and this_duration > args.max_internal_silence_length) or \ - (this_ref_word in non_scored_words and this_duration > args.max_internal_non_scored_length): - # We split this segment at this index, dividing the word in two - # [later on, in PossiblyTruncateBoundaries, it may be further - # truncated.] - # Note: we use 'index_to_split_at + 1' because the Segment constructor - # takes an 'end-index' which is interpreted as one past the end. - new_segment = Segment(self.split_lines_of_utt, cur_start_index, - index_to_split_at + 1, self.debug_str) - if cur_start_is_split: - new_segment.start_keep_proportion = 0.5 - new_segment.end_keep_proportion = 0.5 - cur_start_is_split = True - cur_start_index = index_to_split_at - segments.append(new_segment) - if len(segments) == 0: # We did not split. - segments.append(self) - else: - # We did split. Add the very last segment. - new_segment = Segment(self.split_lines_of_utt, cur_start_index, - self.end_index, self.debug_str) - assert cur_start_is_split - new_segment.start_keep_proportion = 0.5 - segments.append(new_segment) - return segments - - - # This is stage 3 of segment processing. It will truncate the silences and - # non-scored words at the segment boundaries if they are longer than the - # --max-edge-silence-length and --max-edge-non-scored-length respectively - # (and to the extent that this wouldn't take us below the - # --min-segment-length or --min-new-segment-length). - def PossiblyTruncateBoundaries(self): - for b in [True, False]: - if b: - this_index = self.start_index - else: - this_index = self.end_index - 1 - this_split_line = self.split_lines_of_utt[this_index] - truncated_duration = None - this_duration = float(this_split_line[3]) - this_edit = this_split_line[7] - this_ref_word = this_split_line[6] - if this_edit == 'sil' and \ - this_duration > args.max_edge_silence_length: - truncated_duration = args.max_edge_silence_length - elif this_ref_word in non_scored_words and \ - this_duration > args.max_edge_non_scored_length: - truncated_duration = args.max_edge_non_scored_length - if truncated_duration != None: - keep_proportion = truncated_duration / this_duration - if b: - self.start_keep_proportion = keep_proportion - else: - self.end_keep_proportion = keep_proportion - - # This relaxes the segment-boundary truncation of - # PossiblyTruncateBoundaries(), if it would take us below - # min-new-segment-length or min-segment-length. Note: this does not relax - # the boundary truncation for a particular boundary (start or end) if that - # boundary corresponds to a 'tainted' line of the ctm (because it's - # dangerous to include too much 'tainted' audio). - def RelaxBoundaryTruncation(self): - # this should be called before adding unk padding. - assert self.start_unk_padding == self.end_unk_padding == 0.0 - if self.start_keep_proportion == self.end_keep_proportion == 1.0: - return # nothing to do; there was no truncation. - length_cutoff = max(args.min_new_segment_length, args.min_segment_length) - length_with_truncation = self.Length() - if length_with_truncation >= length_cutoff: - return # Nothing to do.
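# [Editor's aside -- a numeric check, not part of the original script.]
# The relaxation below linearly interpolates between the truncated and the
# fully relaxed boundary proportions so that the segment lands exactly on
# the length cutoff. Since Length() is linear in the keep-proportions, the
# interpolation can be checked directly on the lengths (values hypothetical):
length_with_truncation = 0.8          # truncated length, in seconds
length_with_relaxed_boundaries = 1.6  # fully relaxed length, in seconds
length_cutoff = 1.0
a = (length_cutoff - length_with_relaxed_boundaries) / \
    (length_with_truncation - length_with_relaxed_boundaries)
# a == 0.75, and 0.75 * 0.8 + 0.25 * 1.6 == 1.0, exactly the cutoff:
assert abs(a * length_with_truncation
           + (1 - a) * length_with_relaxed_boundaries - length_cutoff) < 1e-9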
- orig_start_keep_proportion = self.start_keep_proportion - orig_end_keep_proportion = self.end_keep_proportion - if not IsTainted(self.split_lines_of_utt[self.start_index]): - self.start_keep_proportion = 1.0 - if not IsTainted(self.split_lines_of_utt[self.end_index - 1]): - self.end_keep_proportion = 1.0 - length_with_relaxed_boundaries = self.Length() - if length_with_relaxed_boundaries <= length_cutoff: - # Completely undo the truncation [to the extent allowed by the - # presence of tainted lines at the start/end] if, even without - # truncation, we'd be below the length cutoff. This segment may be - # removed later on (but it may not, if removing truncation makes us - # identical to the input utterance, and the length is between - # min_segment_length and min_new_segment_length). - return - # Next, compute an interpolation constant a such that the - # {start,end}_keep_proportion values will equal a * - # [values-computed-by-PossiblyTruncateBoundaries()] + (1-a) * [completely-relaxed-values]. - # we're solving the equation: - # length_cutoff = a * length_with_truncation + (1-a) * length_with_relaxed_boundaries - # -> length_cutoff - length_with_relaxed_boundaries = - # a * (length_with_truncation - length_with_relaxed_boundaries) - # -> a = (length_cutoff - length_with_relaxed_boundaries) / (length_with_truncation - length_with_relaxed_boundaries) - a = (length_cutoff - length_with_relaxed_boundaries) / \ - (length_with_truncation - length_with_relaxed_boundaries) - if a < 0.0 or a > 1.0: - print("segment_ctm_edits.py: bad 'a' value = {0}".format(a), file = sys.stderr) - return - self.start_keep_proportion = \ - a * orig_start_keep_proportion + (1-a) * self.start_keep_proportion - self.end_keep_proportion = \ - a * orig_end_keep_proportion + (1-a) * self.end_keep_proportion - if not abs(self.Length() - length_cutoff) < 0.01: - print("segment_ctm_edits.py: possible problem relaxing boundary " - "truncation, length is {0} vs {1}".format(self.Length(), length_cutoff), - file = sys.stderr) - - - # This is stage 4 of segment processing. - # This function may set start_unk_padding and end_unk_padding to nonzero - # values. This is done if the current boundary words are real, scored - # words and we're not next to the beginning or end of the utterance. - def PossiblyAddUnkPadding(self): - for b in [True, False]: - if b: - this_index = self.start_index - else: - this_index = self.end_index - 1 - this_split_line = self.split_lines_of_utt[this_index] - this_start_time = float(this_split_line[2]) - this_ref_word = this_split_line[6] - this_edit = this_split_line[7] - if this_edit == 'cor' and not this_ref_word in non_scored_words: - # we can consider adding unk-padding. - if b: # start of utterance. - unk_padding = args.unk_padding - if unk_padding > this_start_time: # close to beginning of file - unk_padding = this_start_time - # If we could add less than half of the specified - # unk-padding, don't add any (because when we add - # unk-padding we add the unknown-word symbol '<unk>', and if - # there isn't enough space to traverse the HMM we don't want - # to do it at all). - if unk_padding < 0.5 * args.unk_padding: - unk_padding = 0.0 - self.start_unk_padding = unk_padding - else: # end of utterance.
- this_end_time = this_start_time + float(this_split_line[3]) - last_line = self.split_lines_of_utt[-1] - utterance_end_time = float(last_line[2]) + float(last_line[3]) - max_allowable_padding = utterance_end_time - this_end_time - assert max_allowable_padding > -0.01 - unk_padding = args.unk_padding - if unk_padding > max_allowable_padding: - unk_padding = max_allowable_padding - # If we could add less than half of the specified - # unk-padding, don't add any (because when we add - # unk-padding we add the unknown-word symbol '<unk>', and if - # there isn't enough space to traverse the HMM we don't want - # to do it at all). - if unk_padding < 0.5 * args.unk_padding: - unk_padding = 0.0 - self.end_unk_padding = unk_padding - - # This function will merge the segment in 'other' with the segment - # in 'self'. It is only to be called when 'self' and 'other' are from - # the same utterance, 'other' is after 'self' in time order (based on - # the original segment cores), and self.EndTime() >= other.StartTime(). - # Note: in this situation there will normally be deleted words - # between the two segments. What this program does with the deleted - # words depends on '--max-deleted-words-kept-when-merging'. If there - # were any inserted words in the transcript (less likely), this - # program will keep the reference. - def MergeWithSegment(self, other): - assert self.EndTime() >= other.StartTime() and \ - self.StartTime() < other.EndTime() and \ - self.split_lines_of_utt is other.split_lines_of_utt - orig_self_end_index = self.end_index - self.debug_str = "({0}/merged-with/{1})".format(self.debug_str, other.debug_str) - # everything that relates to the end of this segment gets copied - # from 'other'. - self.end_index = other.end_index - self.end_unk_padding = other.end_unk_padding - self.end_keep_proportion = other.end_keep_proportion - # The next thing we have to do is to go over any lines of the ctm that - # appear between 'self' and 'other', or are shared between both (this - # would only happen for tainted silence or non-scored-word segments), - # and decide what to do with them. We'll keep the reference for any - # substitutions or insertions (which anyway are unlikely to appear - # in these merged segments). Note: most of this happens in self.Text(), - # but at this point we need to decide whether to mark any deletions - # as 'discard-this-word'. - first_index_of_overlap = min(orig_self_end_index - 1, other.start_index) - last_index_of_overlap = max(orig_self_end_index - 1, other.start_index) - num_deleted_words = 0 - for i in range(first_index_of_overlap, last_index_of_overlap + 1): - edit_type = self.split_lines_of_utt[i][7] - if edit_type == 'del': - num_deleted_words += 1 - if num_deleted_words > args.max_deleted_words_kept_when_merging: - for i in range(first_index_of_overlap, last_index_of_overlap + 1): - if self.split_lines_of_utt[i][7] == 'del': - self.split_lines_of_utt[i].append('do-not-include-in-text') - - # Returns the start time of the segment (within the enclosing utterance). - # This is before any rounding. - def StartTime(self): - first_line = self.split_lines_of_utt[self.start_index] - first_line_start = float(first_line[2]) - first_line_duration = float(first_line[3]) - first_line_end = first_line_start + first_line_duration - return first_line_end - self.start_unk_padding \ - - (first_line_duration * self.start_keep_proportion) - - # Returns some string-valued information about 'this' that is useful for debugging.
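# [Editor's aside -- a worked example of the StartTime() arithmetic above,
# not part of the original script.] With a first line starting at 1.0 s and
# lasting 0.4 s, of which the last half is kept (start_keep_proportion = 0.5),
# plus 0.05 s of unk-padding: (1.0 + 0.4) - 0.05 - 0.4 * 0.5 = 1.15 s.
first_line_start, first_line_duration = 1.0, 0.4
start_unk_padding, start_keep_proportion = 0.05, 0.5
start_time = (first_line_start + first_line_duration) \
    - start_unk_padding - first_line_duration * start_keep_proportion
assert abs(start_time - 1.15) < 1e-9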
- def DebugInfo(self): - return 'start=%d,end=%d,unk-padding=%.2f,%.2f,keep-proportion=%.2f,%.2f,' % \ - (self.start_index, self.end_index, self.start_unk_padding, - self.end_unk_padding, self.start_keep_proportion, self.end_keep_proportion) + \ - self.debug_str - - # Returns the end time of the segment (within the enclosing utterance). - def EndTime(self): - last_line = self.split_lines_of_utt[self.end_index - 1] - last_line_start = float(last_line[2]) - last_line_duration = float(last_line[3]) - return last_line_start + (last_line_duration * self.end_keep_proportion) \ - + self.end_unk_padding - - # Returns the segment length in seconds. - def Length(self): - return self.EndTime() - self.StartTime() - - def IsWholeUtterance(self): - # returns true if this segment corresponds to the whole utterance that - # it's a part of (i.e. its start time is zero and its end time is the - # end time of the last line of the utterance). - last_line_of_utt = self.split_lines_of_utt[-1] - last_line_end_time = float(last_line_of_utt[2]) + float(last_line_of_utt[3]) - return abs(self.StartTime() - 0.0) < 0.001 and \ - abs(self.EndTime() - last_line_end_time) < 0.001 - - # Returns the proportion of the duration of this segment that consists of - # unk-padding and tainted lines of input (will be between 0.0 and 1.0). - def JunkProportion(self): - # Note: only the first and last lines could possibly be tainted as - # that's how we create the segments; and if either or both are tainted - # the utterance must contain other lines, so double-counting is not a - # problem. - junk_duration = self.start_unk_padding + self.end_unk_padding - first_split_line = self.split_lines_of_utt[self.start_index] - if IsTainted(first_split_line): - first_duration = float(first_split_line[3]) - junk_duration += first_duration * self.start_keep_proportion - last_split_line = self.split_lines_of_utt[self.end_index - 1] - if IsTainted(last_split_line): - last_duration = float(last_split_line[3]) - junk_duration += last_duration * self.end_keep_proportion - return junk_duration / self.Length() - - # This function will remove something from the beginning of the - # segment if it's possible to cleanly lop off a bit that contains - # more junk, as a proportion of its length, than 'args.max_junk_proportion'. - # Junk is defined as unk-padding and/or tainted segments. - # It considers as a potential split point the first silence - # segment or non-tainted non-scored-word segment in the - # utterance. See also PossiblyTruncateEndForJunkProportion(). - def PossiblyTruncateStartForJunkProportion(self): - begin_junk_duration = self.start_unk_padding - first_split_line = self.split_lines_of_utt[self.start_index] - if IsTainted(first_split_line): - first_duration = float(first_split_line[3]) - begin_junk_duration += first_duration * self.start_keep_proportion - if begin_junk_duration == 0.0: - # nothing to do. - return - - candidate_start_index = None - # the following iterates over all lines internal to the utterance. - for i in range(self.start_index + 1, self.end_index - 1): - this_split_line = self.split_lines_of_utt[i] - this_edit_type = this_split_line[7] - this_ref_word = this_split_line[6] - # We'll consider splitting on silence and on non-scored words. - # (i.e. making the silence or non-scored word the left boundary of - # the new utterance and discarding the piece to the left of that).
- if this_edit_type == 'sil' or \ - (this_edit_type == 'cor' and this_ref_word in non_scored_words): - candidate_start_index = i - candidate_start_time = float(this_split_line[2]) - break # Consider only the first potential truncation. - if candidate_start_index == None: - return # Nothing to do as there is no place to split. - candidate_removed_piece_duration = candidate_start_time - self.StartTime() - if begin_junk_duration / candidate_removed_piece_duration < args.max_junk_proportion: - return # Nothing to do as the candidate piece to remove has too - # little junk. - # OK, remove the piece. - self.start_index = candidate_start_index - self.start_unk_padding = 0.0 - self.start_keep_proportion = 1.0 - self.debug_str += ',truncated-start-for-junk' - - # This is like PossiblyTruncateStartForJunkProportion(), but - # acts on the end of the segment; see comments there. - def PossiblyTruncateEndForJunkProportion(self): - end_junk_duration = self.end_unk_padding - last_split_line = self.split_lines_of_utt[self.end_index - 1] - if IsTainted(last_split_line): - last_duration = float(last_split_line[3]) - end_junk_duration += last_duration * self.end_keep_proportion - if end_junk_duration == 0.0: - # nothing to do. - return - - candidate_end_index = None - # the following iterates over all lines internal to the utterance - # (starting from the end). - for i in reversed(range(self.start_index + 1, self.end_index - 1)): - this_split_line = self.split_lines_of_utt[i] - this_edit_type = this_split_line[7] - this_ref_word = this_split_line[6] - # We'll consider splitting on silence and on non-scored words. - # (i.e. making the silence or non-scored word the right boundary of - # the new utterance and discarding the piece to the right of that). - if this_edit_type == 'sil' or \ - (this_edit_type == 'cor' and this_ref_word in non_scored_words): - candidate_end_index = i + 1 # note: end-indexes are one past the last. - candidate_end_time = float(this_split_line[2]) + float(this_split_line[3]) - break # Consider only the latest potential truncation. - if candidate_end_index == None: - return # Nothing to do as there is no place to split. - candidate_removed_piece_duration = self.EndTime() - candidate_end_time - if end_junk_duration / candidate_removed_piece_duration < args.max_junk_proportion: - return # Nothing to do as the candidate piece to remove has too - # little junk. - # OK, remove the piece. - self.end_index = candidate_end_index - self.end_unk_padding = 0.0 - self.end_keep_proportion = 1.0 - self.debug_str += ',truncated-end-for-junk' - - - # this will return true if there is at least one word in the utterance - # that's a scored word (not a non-scored word) and not an OOV word that's - # realized as unk. This becomes a filter on keeping segments. - def ContainsAtLeastOneScoredNonOovWord(self): - global non_scored_words - for i in range(self.start_index, self.end_index): - this_split_line = self.split_lines_of_utt[i] - this_hyp_word = this_split_line[4] - this_ref_word = this_split_line[6] - this_edit = this_split_line[7] - if this_edit == 'cor' and not this_ref_word in non_scored_words \ - and this_ref_word == this_hyp_word: - return True - return False - - # Returns the text corresponding to this utterance, as a string. 
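# [Editor's aside -- illustrative sketch, not part of the original script.]
# The two truncation methods above lop off a leading or trailing piece only
# when the junk it contains (unk-padding plus tainted audio) is a large
# enough fraction of that piece. The decision in toy form:
def worth_truncating(junk_duration, removed_piece_duration,
                     max_junk_proportion=0.1):
    # truncate only if junk accounts for at least the threshold proportion
    # of the piece we would remove
    return junk_duration / removed_piece_duration >= max_junk_proportion

# 0.3 s of junk in a 1.0 s removable piece exceeds the 0.1 default:
assert worth_truncating(0.3, 1.0)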
- def Text(self): - global oov_symbol - text_array = [] - if self.start_unk_padding != 0.0: - text_array.append(oov_symbol) - for i in range(self.start_index, self.end_index): - this_split_line = self.split_lines_of_utt[i] - this_edit = this_split_line[7] - this_ref_word = this_split_line[6] - if this_ref_word != '<eps>' and this_split_line[-1] != 'do-not-include-in-text': - text_array.append(this_ref_word) - if self.end_unk_padding != 0.0: - text_array.append(oov_symbol) - return ' '.join(text_array) - - -# Here, 'text' will be something that indicates the stage of processing, -# e.g. 'Stage 0: segment cores', 'Stage 1: add tainted lines', etc. -def AccumulateSegmentStats(segment_list, text): - global segment_total_length, num_segments - for segment in segment_list: - num_segments[text] += 1 - segment_total_length[text] += segment.Length() - -def PrintSegmentStats(): - global segment_total_length, num_segments, \ - num_utterances, num_utterances_without_segments, \ - total_length_of_utterances - - print('Number of utterances is %d, of which %.2f%% had no segments after ' - 'all processing; total length of data in original utterances (in seconds) ' - 'was %d' % (num_utterances, - num_utterances_without_segments * 100.0 / num_utterances, - total_length_of_utterances), - file = sys.stderr) - - - keys = sorted(segment_total_length.keys()) - for i in range(len(keys)): - key = keys[i] - if i > 0: - delta_percentage = '[%+.2f%%]' % ((segment_total_length[key] - segment_total_length[keys[i-1]]) - * 100.0 / total_length_of_utterances) - print('At %s, num-segments is %d, total length is %.2f%% of original total %s' % ( - key, num_segments[key], - segment_total_length[key] * 100.0 / total_length_of_utterances, - delta_percentage if i > 0 else ''), - file = sys.stderr) - -# This function creates the segments for an utterance as a list -# of class Segment. -# It returns a 2-tuple (list-of-segments, list-of-deleted-segments) -# where the deleted segments are only useful for diagnostic printing. -# Note: split_lines_of_utt is a list of lists, one per line, each containing the -# sequence of fields. -def GetSegmentsForUtterance(split_lines_of_utt): - global num_utterances, num_utterances_without_segments, total_length_of_utterances - - num_utterances += 1 - - segment_ranges = ComputeSegmentCores(split_lines_of_utt) - - utterance_end_time = float(split_lines_of_utt[-1][2]) + float(split_lines_of_utt[-1][3]) - total_length_of_utterances += utterance_end_time - - segments = [ Segment(split_lines_of_utt, x[0], x[1]) - for x in segment_ranges ] - - AccumulateSegmentStats(segments, 'stage 0 [segment cores]') - for segment in segments: - segment.PossiblyAddTaintedLines() - AccumulateSegmentStats(segments, 'stage 1 [add tainted lines]') - new_segments = [] - for s in segments: - new_segments += s.PossiblySplitSegment() - segments = new_segments - AccumulateSegmentStats(segments, 'stage 2 [split segments]') - for s in segments: - s.PossiblyTruncateBoundaries() - AccumulateSegmentStats(segments, 'stage 3 [truncate boundaries]') - for s in segments: - s.RelaxBoundaryTruncation() - AccumulateSegmentStats(segments, 'stage 4 [relax boundary truncation]') - for s in segments: - s.PossiblyAddUnkPadding() - AccumulateSegmentStats(segments, 'stage 5 [unk-padding]') - - deleted_segments = [] - new_segments = [] - for s in segments: - # the 0.999 allows for roundoff error.
- if (not s.IsWholeUtterance() and s.Length() < 0.999 * args.min_new_segment_length): - s.debug_str += '[deleted-because-of--min-new-segment-length]' - deleted_segments.append(s) - else: - new_segments.append(s) - segments = new_segments - AccumulateSegmentStats(segments, 'stage 6 [remove new segments under --min-new-segment-length]') - - new_segments = [] - for s in segments: - # the 0.999 allows for roundoff error. - if s.Length() < 0.999 * args.min_segment_length: - s.debug_str += '[deleted-because-of--min-segment-length]' - deleted_segments.append(s) - else: - new_segments.append(s) - segments = new_segments - AccumulateSegmentStats(segments, 'stage 7 [remove segments under --min-segment-length]') - - for s in segments: - s.PossiblyTruncateStartForJunkProportion() - AccumulateSegmentStats(segments, 'stage 8 [truncate segment-starts for --max-junk-proportion]') - - for s in segments: - s.PossiblyTruncateEndForJunkProportion() - AccumulateSegmentStats(segments, 'stage 9 [truncate segment-ends for --max-junk-proportion]') - - new_segments = [] - for s in segments: - if s.ContainsAtLeastOneScoredNonOovWord(): - new_segments.append(s) - else: - s.debug_str += '[deleted-because-no-scored-non-oov-words]' - deleted_segments.append(s) - - segments = new_segments - AccumulateSegmentStats(segments, 'stage 10 [remove segments without scored,non-OOV words]') - - new_segments = [] - for s in segments: - j = s.JunkProportion() - if j <= args.max_junk_proportion: - new_segments.append(s) - else: - s.debug_str += '[deleted-because-junk-proportion={0}]'.format(j) - deleted_segments.append(s) - - segments = new_segments - AccumulateSegmentStats(segments, 'stage 11 [remove segments with junk exceeding --max-junk-proportion]') - - new_segments = [] - if len(segments) > 0: - new_segments.append(segments[0]) - for i in range(1, len(segments)): - if new_segments[-1].EndTime() >= segments[i].StartTime(): - new_segments[-1].MergeWithSegment(segments[i]) - else: - new_segments.append(segments[i]) - segments = new_segments - AccumulateSegmentStats(segments, 'stage 12 [merge overlapping or touching segments]') - - for i in range(len(segments) - 1): - if segments[i].EndTime() > segments[i+1].StartTime(): - # this just adds something to --ctm-edits-out output - segments[i+1].debug_str += ",overlaps-previous-segment" - - if len(segments) == 0: - num_utterances_without_segments += 1 - - return (segments, deleted_segments) - -# this prints a number with a certain number of digits after -# the point, while removing trailing zeros. -def FloatToString(f): - num_digits = 6 # we want to print 6 digits after the decimal point - g = f - while abs(g) > 1.0: - g *= 0.1 - num_digits += 1 - format_str = '%.{0}g'.format(num_digits) - return format_str % f - -# Gives time in string form as an exact multiple of the frame-length, e.g. 0.01 -# (after rounding). -def TimeToString(time, frame_length): - n = round(time / frame_length) - assert n >= 0 - # The next function call will remove trailing zeros while printing it, so - # that e.g. 0.01 will be printed as 0.01 and not 0.0099999999999999. It - # seems that doing this in a simple way is not really possible (at least, - # not without assuming that frame_length is of the form 10^-n, which we - # don't really want to do). - return FloatToString(n * frame_length) - -def WriteSegmentsForUtterance(text_output_handle, segments_output_handle, - old_utterance_name, segments): - for n in range(len(segments)): - segment = segments[n] - # split utterances will be named foo-bar-1, foo-bar-2, etc.
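# [Editor's aside -- a quick check of the rounding helpers above; this
# mirrors FloatToString()/TimeToString() as standalone functions, and is
# not part of the original script.]
def float_to_string(f):
    num_digits = 6
    g = f
    while abs(g) > 1.0:
        g *= 0.1
        num_digits += 1
    return ('%.{0}g'.format(num_digits)) % f

def time_to_string(time, frame_length):
    # snap to the nearest whole frame, then print without trailing noise
    return float_to_string(round(time / frame_length) * frame_length)

# 0.014999 s snaps to one 0.01 s frame and prints as '0.01', not as
# '0.0099999999...':
assert time_to_string(0.014999, 0.01) == '0.01'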
- new_utterance_name = old_utterance_name + "-" + str(n + 1) - # print a line to the text output of the form - # <new-utterance-id> <text> - # like: - # foo-bar-1 hello this is dan - print(new_utterance_name, segment.Text(), file = text_output_handle) - # print a line to the segments output of the form - # <new-utterance-id> <old-utterance-id> <start-time> <end-time> - # like: - # foo-bar-1 foo-bar 5.1 7.2 - print(new_utterance_name, old_utterance_name, - TimeToString(segment.StartTime(), args.frame_length), - TimeToString(segment.EndTime(), args.frame_length), - file = segments_output_handle) - - - -# Note, this is destructive of 'segments_for_utterance', but it won't matter. -def PrintDebugInfoForUtterance(ctm_edits_out_handle, - split_lines_of_cur_utterance, - segments_for_utterance, - deleted_segments_for_utterance): - # info_to_print will be a list of 2-tuples (time, 'start-segment-n'|'end-segment-n') - # representing the start or end times of segments. - info_to_print = [] - for n in range(len(segments_for_utterance)): - segment = segments_for_utterance[n] - start_string = 'start-segment-' + str(n+1) + '[' + segment.DebugInfo() + ']' - info_to_print.append( (segment.StartTime(), start_string) ) - end_string = 'end-segment-' + str(n+1) - info_to_print.append( (segment.EndTime(), end_string) ) - # for segments that were deleted we print info like start-deleted-segment-1, and - # otherwise similar info to segments that were retained. - for n in range(len(deleted_segments_for_utterance)): - segment = deleted_segments_for_utterance[n] - start_string = 'start-deleted-segment-' + str(n+1) + '[' + segment.DebugInfo() + ']' - info_to_print.append( (segment.StartTime(), start_string) ) - end_string = 'end-deleted-segment-' + str(n+1) - info_to_print.append( (segment.EndTime(), end_string) ) - - info_to_print = sorted(info_to_print) - - for i in range(len(split_lines_of_cur_utterance)): - split_line = split_lines_of_cur_utterance[i] - split_line[0] += '[' + str(i) + ']' # add an index like [0], [1], to - # the utterance-id so we can easily - # look up segment indexes. - start_time = float(split_line[2]) - end_time = start_time + float(split_line[3]) - split_line_copy = list(split_line) - while len(info_to_print) > 0 and info_to_print[0][0] <= end_time: - (segment_start, string) = info_to_print[0] - # shift the first element off of info_to_print. - info_to_print = info_to_print[1:] - # add a field like 'start-segment1[...]=3.21' to what we're about to print. - split_line_copy.append(string + "=" + TimeToString(segment_start, args.frame_length)) - print(' '.join(split_line_copy), file = ctm_edits_out_handle) - -# This accumulates word-level stats: for each reference word, with what -# probability it will end up in the core of a segment. Words with low -# probabilities of being in segments will generally be associated with some kind -# of error (there is a higher probability of having a wrong lexicon entry).
-def AccWordStatsForUtterance(split_lines_of_utt, - segments_for_utterance): - # word_count_pair is a map from a string (the word) to - # a list [total-count, count-not-within-segments] - global word_count_pair - line_is_in_segment = [ False ] * len(split_lines_of_utt) - for segment in segments_for_utterance: - for i in range(segment.start_index, segment.end_index): - line_is_in_segment[i] = True - for i in range(len(split_lines_of_utt)): - this_ref_word = split_lines_of_utt[i][6] - if this_ref_word != '<eps>': - word_count_pair[this_ref_word][0] += 1 - if not line_is_in_segment[i]: - word_count_pair[this_ref_word][1] += 1 - -def PrintWordStats(word_stats_out): - try: - f = open(word_stats_out, 'w') - except: - sys.exit("segment_ctm_edits.py: error opening word-stats file --word-stats-out={0} " - "for writing".format(word_stats_out)) - global word_count_pair - # Sort from most to least problematic. We want to give more prominence to - # words that are most frequently not in segments, but also to high-count - # words. Define badness = pair[1] / pair[0], and total_count = pair[0], - # where 'pair' is a value of word_count_pair. We'll reverse sort on - # badness^3 * total_count = pair[1]^3 / pair[0]^2. - for key, pair in sorted(word_count_pair.items(), - key = lambda item: (item[1][1] ** 3) * 1.0 / (item[1][0] ** 2), - reverse = True): - badness = pair[1] * 1.0 / pair[0] - total_count = pair[0] - print(key, badness, total_count, file = f) - try: - f.close() - except: - sys.exit("segment_ctm_edits.py: error closing file --word-stats-out={0} " - "(full disk?)".format(word_stats_out)) - print("segment_ctm_edits.py: please see the file {0} for word-level statistics " - "saying how frequently each word was excluded from a segment; format is " - "<word> <bad-proportion> <total-count>. Particularly " - "problematic words appear near the top of the file.".format(word_stats_out), - file = sys.stderr) - - -def ProcessData(): - try: - f_in = open(args.ctm_edits_in) - except: - sys.exit("segment_ctm_edits.py: error opening ctm-edits input " - "file {0}".format(args.ctm_edits_in)) - try: - text_output_handle = open(args.text_out, 'w') - except: - sys.exit("segment_ctm_edits.py: error opening text output " - "file {0}".format(args.text_out)) - try: - segments_output_handle = open(args.segments_out, 'w') - except: - sys.exit("segment_ctm_edits.py: error opening segments output " - "file {0}".format(args.segments_out)) - if args.ctm_edits_out != None: - try: - ctm_edits_output_handle = open(args.ctm_edits_out, 'w') - except: - sys.exit("segment_ctm_edits.py: error opening ctm-edits output " - "file {0}".format(args.ctm_edits_out)) - - # Most of what we're doing in the lines below is splitting the input lines - # and grouping them per utterance, before giving them to GetSegmentsForUtterance() - # and then writing the text, segments and optional debug output.
- first_line = f_in.readline() - if first_line == '': - sys.exit("segment_ctm_edits.py: empty input") - split_pending_line = first_line.split() - if len(split_pending_line) == 0: - sys.exit("segment_ctm_edits.py: bad input line " + first_line) - cur_utterance = split_pending_line[0] - split_lines_of_cur_utterance = [] - - while True: - if len(split_pending_line) == 0 or split_pending_line[0] != cur_utterance: - (segments_for_utterance, - deleted_segments_for_utterance) = GetSegmentsForUtterance(split_lines_of_cur_utterance) - AccWordStatsForUtterance(split_lines_of_cur_utterance, segments_for_utterance) - WriteSegmentsForUtterance(text_output_handle, segments_output_handle, - cur_utterance, segments_for_utterance) - if args.ctm_edits_out != None: - PrintDebugInfoForUtterance(ctm_edits_output_handle, - split_lines_of_cur_utterance, - segments_for_utterance, - deleted_segments_for_utterance) - split_lines_of_cur_utterance = [] - if len(split_pending_line) == 0: - break - else: - cur_utterance = split_pending_line[0] - - split_lines_of_cur_utterance.append(split_pending_line) - next_line = f_in.readline() - split_pending_line = next_line.split() - if len(split_pending_line) == 0: - if next_line != '': - sys.exit("segment_ctm_edits.py: got an empty or whitespace input line") - try: - text_output_handle.close() - segments_output_handle.close() - if args.ctm_edits_out != None: - ctm_edits_output_handle.close() - except: - sys.exit("segment_ctm_edits.py: error closing one or more outputs " - "(broken pipe or full disk?)") - - -def ReadNonScoredWords(non_scored_words_file): - global non_scored_words - try: - f = open(non_scored_words_file) - except: - sys.exit("segment_ctm_edits.py: error opening file: " - "--non-scored-words=" + non_scored_words_file) - for line in f.readlines(): - a = line.split() - if len(a) != 1: - sys.exit("segment_ctm_edits.py: bad line in non-scored-words " - "file {0}: {1}".format(non_scored_words_file, line)) - non_scored_words.add(a[0]) - f.close() - - - - -non_scored_words = set() -ReadNonScoredWords(args.non_scored_words_in) - -oov_symbol = None -if args.oov_symbol_file != None: - try: - with open(args.oov_symbol_file) as f: - line = f.readline() - assert len(line.split()) == 1 - oov_symbol = line.split()[0] - assert f.readline() == '' - except Exception as e: - sys.exit("segment_ctm_edits.py: error reading file --oov-symbol-file=" + - args.oov_symbol_file + ", error is: " + str(e)) -elif args.unk_padding != 0.0: - sys.exit("segment_ctm_edits.py: if the --unk-padding option is nonzero (which " - "it is by default), the --oov-symbol-file option must be supplied.") - -# segment_total_length and num_segments are maps from -# 'stage' strings; see AccumulateSegmentStats for details. -segment_total_length = defaultdict(int) -num_segments = defaultdict(int) -# the lambda expression below is an anonymous function that takes no arguments -# and returns the new list [0, 0]. 
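# [Editor's aside -- illustrative sketch, not part of the original script.]
# How the word_count_pair structure defined just below behaves, together
# with the badness^3 * total_count ranking used by PrintWordStats() above:
from collections import defaultdict

counts = defaultdict(lambda: [0, 0])  # word -> [total, not-in-any-segment]
for word, excluded in [('hello', False), ('hello', True), ('zzz', True)]:
    counts[word][0] += 1
    if excluded:
        counts[word][1] += 1

ranked = sorted(counts.items(),
                key=lambda item: (item[1][1] ** 3) * 1.0 / (item[1][0] ** 2),
                reverse=True)
# 'zzz' (excluded every time it occurs) ranks above 'hello' (excluded half
# the time):
assert ranked[0][0] == 'zzz'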
-word_count_pair = defaultdict(lambda: [0, 0]) -num_utterances = 0 -num_utterances_without_segments = 0 -total_length_of_utterances = 0 - - -ProcessData() -PrintSegmentStats() -if args.word_stats_out != None: - PrintWordStats(args.word_stats_out) -if args.ctm_edits_out != None: - print("segment_ctm_edits.py: detailed utterance-level debug information " - "is in " + args.ctm_edits_out, file = sys.stderr) - diff --git a/egs/wsj/s5/steps/cleanup/taint_ctm_edits.py b/egs/wsj/s5/steps/cleanup/taint_ctm_edits.py deleted file mode 100755 index c763d7191a1..00000000000 --- a/egs/wsj/s5/steps/cleanup/taint_ctm_edits.py +++ /dev/null @@ -1,242 +0,0 @@ -#!/usr/bin/env python - -# Copyright 2016 Vimal Manohar -# 2016 Johns Hopkins University (author: Daniel Povey) -# Apache 2.0 - -from __future__ import print_function -import sys, operator, argparse, os -from collections import defaultdict - -# This script reads and writes the 'ctm-edits' file that is -# produced by get_ctm_edits.py. -# -# It is to be applied after modify_ctm_edits.py. Its function is to add, in -# certain circumstances, an optional extra field with the word 'tainted' to the -# ctm-edits format, e.g. an input line like: -# -# AJJacobs_2007P-0001605-0003029 1 0 0.09 <eps> 1.0 <eps> sil -# might become: -# AJJacobs_2007P-0001605-0003029 1 0 0.09 <eps> 1.0 <eps> sil tainted -# -# It also deletes certain lines, representing deletions, from the ctm (if they -# were next to taintable lines... their presence could then be inferred from the -# 'tainted' flag). -# -# You should interpret the 'tainted' flag as "we're not sure what's going on here; -# don't trust this." -# -# One of the problems this script is trying to solve is that if we have errors -# that are adjacent to silence or non-scored words, -# it's not at all clear whether the silence or non-scored words were really such, -# or might have contained actual words. -# Also, if we have words in the reference that were realized as '<unk>' in the -# hypothesis, and they are adjacent to errors, it's almost always the case -# that the '<unk>' doesn't really correspond to the word in the reference, so -# we mark these as 'tainted'. -# -# The rule for tainting is quite simple; see the code. - - - -parser = argparse.ArgumentParser( - description = "This program modifies the ctm-edits format to identify " - "silence and 'fixed' non-scored-word lines, and lines where the hyp is " - "'<unk>' and the reference is a real but OOV word, where there is a relatively " - "high probability that something is going wrong so we shouldn't trust " - "this line. It adds the field 'tainted' to such " - "lines. Lines in the ctm representing deletions from the reference will " - "be removed if they have 'tainted' adjacent lines (since it won't be clear " - "where such reference words were really realized, if at all). " - "See comments at the top of the script for more information.") - -parser.add_argument("--verbose", type = int, default = 1, - choices=[0,1,2,3], - help = "Verbose level, higher = more verbose output") -parser.add_argument("ctm_edits_in", metavar = "<ctm-edits-in>", - help = "Filename of input ctm-edits file. " - "Use /dev/stdin for standard input.") -parser.add_argument("ctm_edits_out", metavar = "<ctm-edits-out>", - help = "Filename of output ctm-edits file. " - "Use /dev/stdout for standard output.") - -args = parser.parse_args() - - - -# This function is the core of the program, that does the tainting and -# removes some lines representing deletions. -# split_lines_of_utt is a list of lists, one per line, each containing the -# sequence of fields.
Returns the same format of data after processing to add -# the 'tainted' field. Note: this function is destructive of its input; the -# input will not have the same value afterwards. -def ProcessUtterance(split_lines_of_utt): - global num_lines_of_type, num_tainted_lines, \ - num_del_lines_giving_taint, num_sub_lines_giving_taint, \ - num_ins_lines_giving_taint - - # work out whether each line is taintable [i.e. silence or fix or unk replacing - # real-word]. - taintable = [ False ] * len(split_lines_of_utt) - for i in range(len(split_lines_of_utt)): - edit_type = split_lines_of_utt[i][7] - if edit_type == 'sil' or edit_type == 'fix': - taintable[i] = True - elif edit_type == 'cor' and split_lines_of_utt[i][4] != split_lines_of_utt[i][6]: - # this is the case when '<unk>' replaces a real word that was out of - # the vocabulary; we mark it as correct because such words do - # translate to '<unk>' if we don't have a pronunciation. However we - # don't have good confidence that the alignments of such words are - # accurate if they are adjacent to errors. - taintable[i] = True - - - for i in range(len(split_lines_of_utt)): - edit_type = split_lines_of_utt[i][7] - num_lines_of_type[edit_type] += 1 - if edit_type == 'del' or edit_type == 'sub' or edit_type == 'ins': - tainted_an_adjacent_line = False - # First go backwards tainting lines - j = i - 1 - while j >= 0 and taintable[j]: - tainted_an_adjacent_line = True - if len(split_lines_of_utt[j]) == 8: - num_tainted_lines += 1 - split_lines_of_utt[j].append('tainted') - j -= 1 - # Next go forwards tainting lines - j = i + 1 - while j < len(split_lines_of_utt) and taintable[j]: - tainted_an_adjacent_line = True - if len(split_lines_of_utt[j]) == 8: - num_tainted_lines += 1 - split_lines_of_utt[j].append('tainted') - j += 1 - if tainted_an_adjacent_line: - if edit_type == 'del': - split_lines_of_utt[i][7] = 'remove-this-line' - num_del_lines_giving_taint += 1 - elif edit_type == 'sub': - num_sub_lines_giving_taint += 1 - else: - num_ins_lines_giving_taint += 1 - - new_split_lines_of_utt = [] - for i in range(len(split_lines_of_utt)): - if split_lines_of_utt[i][7] != 'remove-this-line': - new_split_lines_of_utt.append(split_lines_of_utt[i]) - return new_split_lines_of_utt - - -def ProcessData(): - try: - f_in = open(args.ctm_edits_in) - except: - sys.exit("taint_ctm_edits.py: error opening ctm-edits input " - "file {0}".format(args.ctm_edits_in)) - try: - f_out = open(args.ctm_edits_out, 'w') - except: - sys.exit("taint_ctm_edits.py: error opening ctm-edits output " - "file {0}".format(args.ctm_edits_out)) - num_lines_processed = 0 - - - # Most of what we're doing in the lines below is splitting the input lines - # and grouping them per utterance, before giving them to ProcessUtterance() - # and then printing the modified lines.
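# [Editor's aside -- a toy version of the tainting rule implemented by
# ProcessUtterance() above, not part of the original script.] Around every
# error line (ins/del/sub), adjacent 'taintable' lines are flagged, scanning
# outward in both directions:
def taint(edit_types, taintable):
    tainted = [False] * len(edit_types)
    for i, edit in enumerate(edit_types):
        if edit in ('ins', 'del', 'sub'):
            for step in (-1, 1):  # scan backwards, then forwards
                j = i + step
                while 0 <= j < len(edit_types) and taintable[j]:
                    tainted[j] = True
                    j += step
    return tainted

# a silence next to a deletion is tainted; one far from any error is not:
assert taint(['cor', 'sil', 'del', 'cor', 'sil'],
             [False, True, False, False, True]) == \
    [False, True, False, False, False]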
- first_line = f_in.readline() - if first_line == '': - sys.exit("taint_ctm_edits.py: empty input") - split_pending_line = first_line.split() - if len(split_pending_line) == 0: - sys.exit("taint_ctm_edits.py: bad input line " + first_line) - cur_utterance = split_pending_line[0] - split_lines_of_cur_utterance = [] - - while True: - if len(split_pending_line) == 0 or split_pending_line[0] != cur_utterance: - split_lines_of_cur_utterance = ProcessUtterance(split_lines_of_cur_utterance) - for split_line in split_lines_of_cur_utterance: - print(' '.join(split_line), file = f_out) - split_lines_of_cur_utterance = [] - if len(split_pending_line) == 0: - break - else: - cur_utterance = split_pending_line[0] - - split_lines_of_cur_utterance.append(split_pending_line) - next_line = f_in.readline() - split_pending_line = next_line.split() - if len(split_pending_line) == 0: - if next_line != '': - sys.exit("taint_ctm_edits.py: got an empty or whitespace input line") - try: - f_out.close() - except: - sys.exit("taint_ctm_edits.py: error closing ctm-edits output " - "(broken pipe or full disk?)") - - -def PrintStats(): - tot_lines = sum(num_lines_of_type.values()) - if args.verbose < 1 or tot_lines == 0: - return - print("taint_ctm_edits.py: processed {0} input lines, whose edit-types were: ".format(tot_lines) + - ', '.join([ '%s = %.2f%%' % (k, num_lines_of_type[k] * 100.0 / tot_lines) - for k in sorted(num_lines_of_type.keys(), reverse = True, - key = lambda k: num_lines_of_type[k]) ]), - file = sys.stderr) - - - del_giving_taint_percent = num_del_lines_giving_taint * 100.0 / tot_lines - sub_giving_taint_percent = num_sub_lines_giving_taint * 100.0 / tot_lines - ins_giving_taint_percent = num_ins_lines_giving_taint * 100.0 / tot_lines - tainted_lines_percent = num_tainted_lines * 100.0 / tot_lines - - print("taint_ctm_edits.py: as a percentage of all lines, (%.2f%%, %.2f%%, %.2f%%) were " - "(deletions, substitutions, insertions) that tainted adjacent lines. %.2f%% of all " - "lines were tainted." % (del_giving_taint_percent, sub_giving_taint_percent, - ins_giving_taint_percent, tainted_lines_percent), - file = sys.stderr) - - - -# num_lines_of_type will map from line-type ('cor', 'sub', etc.) to count.
-num_lines_of_type = defaultdict(int) -num_tainted_lines = 0 -num_del_lines_giving_taint = 0 -num_sub_lines_giving_taint = 0 -num_ins_lines_giving_taint = 0 - -ProcessData() -PrintStats() - diff --git a/egs/wsj/s5/utils/lang/make_phone_lm.py b/egs/wsj/s5/utils/lang/make_phone_lm.py index 28ed7d3426b..47d2a45d229 100755 --- a/egs/wsj/s5/utils/lang/make_phone_lm.py +++ b/egs/wsj/s5/utils/lang/make_phone_lm.py @@ -151,7 +151,7 @@ def AddRawCountsFromLine(self, line): try: words = [self.bos_symbol] + [ int(x) for x in line.split() ] + [self.eos_symbol] except: - sys.exit("make_one_biased_lm.py: bad input line {0} (expected a sequence " + sys.exit("make_phone_lm.py: bad input line {0} (expected a sequence " "of integers)".format(line)) for n in range(1, len(words)): @@ -170,7 +170,7 @@ def AddRawCountsFromStandardInput(self): self.AddRawCountsFromLine(line) lines_processed += 1 if lines_processed == 0 or args.verbose > 0: - print("make_one_biased_lm.py: processed {0} lines of input".format( + print("make_phone_lm.py: processed {0} lines of input".format( lines_processed), file = sys.stderr)
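# [Editor's aside on the make_phone_lm.py hunk above -- a minimal sketch of
# the input handling whose messages it renames; the bos/eos ids and function
# name here are hypothetical, and the real script tracks much more state.]
from collections import defaultdict

def raw_ngram_counts(lines, order=2, bos=-1, eos=-2):
    counts = defaultdict(int)
    for line in lines:
        # each line is a sequence of integer phone ids, bracketed by BOS/EOS
        words = [bos] + [int(x) for x in line.split()] + [eos]
        for n in range(1, len(words)):
            history = tuple(words[max(0, n - order + 1):n])
            counts[(history, words[n])] += 1
    return counts

# raw_ngram_counts(['1 2 2']) counts the bigrams (-1,1), (1,2), (2,2), (2,-2).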