kaldi-asr · danpovey · Oct 29, 2016 · Oct 29, 2016 · Oct 29, 2016 · Oct 29, 2016
diff --git a/egs/wsj/s5/steps/cleanup/get_ctm_edits.py b/egs/wsj/s5/steps/cleanup/get_ctm_edits.py
diff --git a/egs/wsj/s5/steps/cleanup/get_non_scored_words.py b/egs/wsj/s5/steps/cleanup/get_non_scored_words.py
diff --git a/egs/wsj/s5/steps/cleanup/internal/get_ctm_edits.py b/egs/wsj/s5/steps/cleanup/internal/get_ctm_edits.py
@@ -275,11 +275,20 @@ def OutputCtm(utterance_id, edits_array, ctm_array):
 def ProcessOneUtterance(utterance_id, edits_line, ctm_lines):
     try:
         # Remove the utterance-id from the beginning of the edits line
-        edits_line = edits_line[len(utterance_id) + 1:]
+        edits_fields = edits_line[len(utterance_id) + 1:]
 
-        # e.g. if edits_line is now 'i i ; see be ; my my ', edits_array will become
+        # e.g. if edits_fields is now 'i i ; see be ; my my ', edits_array will become
         #  [ ['i', 'i'], ['see', 'be'], ['my', 'my'] ]
-        edits_array = [ x.split() for x in edits_line.split(";") ]
+        fields_split = edits_fields.split()
+        first_fields, second_fields = fields_split[0::3], fields_split[1::3]
+        if (
+            len(first_fields) != len(second_fields) or
+            (len(fields_split) >= 3 and set(fields_split[2::3]) != {';'})
+        ):
+            sys.exit("get_ctm_edits.py: could not make sense of edits line: " + edits_line)
+
+        edits_array = list(zip(first_fields, second_fields))
+
         # ctm_array will now become something like [ ['1', '1.010', '0.240', 'little ' ], ... ]
         ctm_array = [ x.split() for x in ctm_lines ]
         ctm_array = []

diff --git a/egs/wsj/s5/steps/cleanup/internal/get_non_scored_words.py b/egs/wsj/s5/steps/cleanup/internal/get_non_scored_words.py
@@ -39,19 +39,19 @@ def ReadLang(lang_dir):
     global non_scored_words
 
     if not os.path.isdir(lang_dir):
-        sys.exit("modify_ctm_edits.py expected lang/ directory {0} to "
+        sys.exit("get_non_scored_words.py expected lang/ directory {0} to "
                  "exist.".format(lang_dir))
     for f in [ '/words.txt', '/phones/silence.int', '/phones/align_lexicon.int' ]:
         if not os.path.exists(lang_dir + f):
-            sys.exit("modify_ctm_edits.py: expected file {0}{1} to exist.".format(
+            sys.exit("get_non_scored_words.py: expected file {0}{1} to exist.".format(
                     lang_dir, f))
     # read silence-phones.
     try:
         silence_phones = set()
         for line in open(lang_dir + '/phones/silence.int').readlines():
             silence_phones.add(int(line))
     except Exception as e:
-        sys.exit("modify_ctm_edits.py: problem reading file "
+        sys.exit("get_non_scored_words.py: problem reading file "
                  "{0}/phones/silence.int: {1}".format(lang_dir, str(e)))
 
     # read align_lexicon.int.
@@ -67,7 +67,7 @@ def ReadLang(lang_dir):
                     int(a[2]) in silence_phones:
                 silence_word_ints.add(int(a[0]))
     except Exception as e:
-        sys.exit("modify_ctm_edits.py: problem reading file "
+        sys.exit("get_non_scored_words.py: problem reading file "
                  "{0}/phones/align_lexicon.int: "
                  "{1}".format(lang_dir, str(e)))
 
@@ -77,11 +77,11 @@ def ReadLang(lang_dir):
             if int(integer) in silence_word_ints:
                 non_scored_words.add(word)
     except Exception as e:
-        sys.exit("modify_ctm_edits.py: problem reading file "
+        sys.exit("get_non_scored_words.py: problem reading file "
                  "{0}/words.txt.int: {1}".format(lang_dir, str(e)))
 
     if not len(non_scored_words) == len(silence_word_ints):
-        sys.exit("modify_ctm_edits.py: error getting silence words, len({0}) != len({1})",
+        sys.exit("get_non_scored_words.py: error getting silence words, len({0}) != len({1})",
                  str(non_scored_words), str(silence_word_ints))
     for word in non_scored_words:
         print(word)

diff --git a/egs/wsj/s5/steps/cleanup/internal/modify_ctm_edits.py b/egs/wsj/s5/steps/cleanup/internal/modify_ctm_edits.py
@@ -52,7 +52,7 @@
 
 parser = argparse.ArgumentParser(
     description = "This program modifies the reference in the ctm-edits which "
-    "is output by steps/cleanup/get_ctm_edits.py, to allow insertions, deletions and "
+    "is output by steps/cleanup/internal/get_ctm_edits.py, to allow insertions, deletions and "
     "substitutions of non-scored words, and [if --allow-repetitions=true], "
     "duplications of single words or pairs of scored words (to account for dysfluencies "
     "that were not transcribed).  Note: deletions and substitutions of non-scored words "

diff --git a/egs/wsj/s5/steps/cleanup/internal/segment_ctm_edits.py b/egs/wsj/s5/steps/cleanup/internal/segment_ctm_edits.py
@@ -912,34 +912,34 @@ def ProcessData():
     try:
         f_in = open(args.ctm_edits_in)
     except:
-        sys.exit("modify_ctm_edits.py: error opening ctm-edits input "
+        sys.exit("segment_ctm_edits.py: error opening ctm-edits input "
                  "file {0}".format(args.ctm_edits_in))
     try:
         text_output_handle = open(args.text_out, 'w')
     except:
-        sys.exit("modify_ctm_edits.py: error opening text output "
+        sys.exit("segment_ctm_edits.py: error opening text output "
                  "file {0}".format(args.text_out))
     try:
         segments_output_handle = open(args.segments_out, 'w')
     except:
-        sys.exit("modify_ctm_edits.py: error opening segments output "
+        sys.exit("segment_ctm_edits.py: error opening segments output "
                  "file {0}".format(args.text_out))
     if args.ctm_edits_out != None:
         try:
             ctm_edits_output_handle = open(args.ctm_edits_out, 'w')
         except:
-            sys.exit("modify_ctm_edits.py: error opening ctm-edits output "
+            sys.exit("segment_ctm_edits.py: error opening ctm-edits output "
                      "file {0}".format(args.ctm_edits_out))
 
     # Most of what we're doing in the lines below is splitting the input lines
     # and grouping them per utterance, before giving them to ProcessUtterance()
     # and then printing the modified lines.
     first_line = f_in.readline()
     if first_line == '':
-        sys.exit("modify_ctm_edits.py: empty input")
+        sys.exit("segment_ctm_edits.py: empty input")
     split_pending_line = first_line.split()
     if len(split_pending_line) == 0:
-        sys.exit("modify_ctm_edits.py: bad input line " + first_line)
+        sys.exit("segment_ctm_edits.py: bad input line " + first_line)
     cur_utterance = split_pending_line[0]
     split_lines_of_cur_utterance = []
 
@@ -966,14 +966,14 @@ def ProcessData():
         split_pending_line = next_line.split()
         if len(split_pending_line) == 0:
             if next_line != '':
-                sys.exit("modify_ctm_edits.py: got an empty or whitespace input line")
+                sys.exit("segment_ctm_edits.py: got an empty or whitespace input line")
     try:
         text_output_handle.close()
         segments_output_handle.close()
         if args.ctm_edits_out != None:
             ctm_edits_output_handle.close()
     except:
-        sys.exit("modify_ctm_edits.py: error closing one or more outputs "
+        sys.exit("segment_ctm_edits.py: error closing one or more outputs "
                  "(broken pipe or full disk?)")
 
 
@@ -982,12 +982,12 @@ def ReadNonScoredWords(non_scored_words_file):
     try:
         f = open(non_scored_words_file)
     except:
-        sys.exit("modify_ctm_edits.py: error opening file: "
+        sys.exit("segment_ctm_edits.py: error opening file: "
                  "--non-scored-words=" + non_scored_words_file)
     for line in f.readlines():
         a = line.split()
         if not len(line.split()) == 1:
-            sys.exit("modify_ctm_edits.py: bad line in non-scored-words "
+            sys.exit("segment_ctm_edits.py: bad line in non-scored-words "
                      "file {0}: {1}".format(non_scored_words_file, line))
         non_scored_words.add(a[0])
     f.close()

diff --git a/egs/wsj/s5/steps/cleanup/internal/taint_ctm_edits.py b/egs/wsj/s5/steps/cleanup/internal/taint_ctm_edits.py
@@ -132,12 +132,12 @@ def ProcessData():
     try:
         f_in = open(args.ctm_edits_in)
     except:
-        sys.exit("modify_ctm_edits.py: error opening ctm-edits input "
+        sys.exit("taint_ctm_edits.py: error opening ctm-edits input "
                  "file {0}".format(args.ctm_edits_in))
     try:
         f_out = open(args.ctm_edits_out, 'w')
     except:
-        sys.exit("modify_ctm_edits.py: error opening ctm-edits output "
+        sys.exit("taint_ctm_edits.py: error opening ctm-edits output "
                  "file {0}".format(args.ctm_edits_out))
     num_lines_processed = 0
 
@@ -147,10 +147,10 @@ def ProcessData():
     # and then printing the modified lines.
     first_line = f_in.readline()
     if first_line == '':
-        sys.exit("modify_ctm_edits.py: empty input")
+        sys.exit("taint_ctm_edits.py: empty input")
     split_pending_line = first_line.split()
     if len(split_pending_line) == 0:
-        sys.exit("modify_ctm_edits.py: bad input line " + first_line)
+        sys.exit("taint_ctm_edits.py: bad input line " + first_line)
     cur_utterance = split_pending_line[0]
     split_lines_of_cur_utterance = []
 
@@ -170,7 +170,7 @@ def ProcessData():
         split_pending_line = next_line.split()
         if len(split_pending_line) == 0:
             if next_line != '':
-                sys.exit("modify_ctm_edits.py: got an empty or whitespace input line")
+                sys.exit("taint_ctm_edits.py: got an empty or whitespace input line")
     try:
         f_out.close()
     except:
@@ -181,13 +181,13 @@ def PrintNonScoredStats():
     if args.verbose < 1:
         return
     if num_lines == 0:
-        print("modify_ctm_edits.py: processed no input.", file = sys.stderr)
+        print("taint_ctm_edits.py: processed no input.", file = sys.stderr)
     num_lines_modified = sum(ref_change_stats.values())
     num_incorrect_lines = num_lines - num_correct_lines
     percent_lines_incorrect= '%.2f' % (num_incorrect_lines * 100.0 / num_lines)
     percent_modified = '%.2f' % (num_lines_modified * 100.0 / num_lines);
     percent_of_incorrect_modified = '%.2f' % (num_lines_modified * 100.0 / num_incorrect_lines)
-    print("modify_ctm_edits.py: processed {0} lines of ctm ({1}% of which incorrect), "
+    print("taint_ctm_edits.py: processed {0} lines of ctm ({1}% of which incorrect), "
           "of which {2} were changed fixing the reference for non-scored words "
           "({3}% of lines, or {4}% of incorrect lines)".format(
             num_lines, percent_lines_incorrect, num_lines_modified,
@@ -198,7 +198,7 @@ def PrintNonScoredStats():
                   key = lambda x: ref_change_stats[x])
     num_keys_to_print = 40 if args.verbose >= 2 else 10
 
-    print("modify_ctm_edits.py: most common edits (as percentages "
+    print("taint_ctm_edits.py: most common edits (as percentages "
           "of all such edits) are:\n" +
           ('\n'.join([ '%s [%.2f%%]' % (k, ref_change_stats[k]*100.0/num_lines_modified)
                      for k in keys[0:num_keys_to_print]]))

diff --git a/egs/wsj/s5/steps/cleanup/lattice_oracle_align.sh b/egs/wsj/s5/steps/cleanup/lattice_oracle_align.sh
@@ -192,7 +192,7 @@ if [ $stage -le 5 ]; then
 
   $cmd $dir/log/get_ctm_edits.log \
     align-text ark:$dir/oracle_hyp.txt ark:$dir/text ark,t:-  \| \
-      steps/cleanup/get_ctm_edits.py --oov=$oov --symbol-table=$lang/words.txt \
+      steps/cleanup/internal/get_ctm_edits.py --oov=$oov --symbol-table=$lang/words.txt \
        /dev/stdin $dir/ctm $dir/ctm_edits || exit 1
 
   echo "$0: ctm with edits information appended is in $dir/ctm_edits"