Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion egs/wsj/s5/steps/cleanup/internal/align_ctm_ref.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ def read_text(text_file):
"Did not get enough columns; line {0} in {1}"
"".format(line, text_file.name))
elif len(parts) == 1:
logger.warn("Empty transcript for utterance %s in %s",
logger.warn("Empty transcript for utterance %s in %s",
parts[0], text_file.name)
yield parts[0], []
else:
Expand Down
10 changes: 5 additions & 5 deletions egs/wsj/s5/steps/cleanup/internal/get_ctm_edits.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env python
#!/usr/bin/env python3

# Copyright 2016 Vimal Manohar
# 2016 Johns Hopkins University (author: Daniel Povey)
Expand Down Expand Up @@ -116,17 +116,17 @@
def OpenFiles():
global ctm_edits_out, edits_in, ctm_in, symbol_table, oov_word
try:
ctm_edits_out = open(args.ctm_edits_out, 'w')
ctm_edits_out = open(args.ctm_edits_out, 'w', encoding='utf-8')
except:
sys.exit("get_ctm_edits.py: error opening ctm-edits file {0} for output".format(
args.ctm_edits_out))
try:
edits_in = open(args.edits_in)
edits_in = open(args.edits_in, encoding='utf-8')
except:
sys.exit("get_ctm_edits.py: error opening edits file {0} for input".format(
args.edits_in))
try:
ctm_in = open(args.ctm_in)
ctm_in = open(args.ctm_in, encoding='utf-8')
except:
sys.exit("get_ctm_edits.py: error opening ctm file {0} for input".format(
args.ctm_in))
Expand All @@ -138,7 +138,7 @@ def OpenFiles():
print("get_ctm_edits.py: error: if you set the the --symbol-table option "
"you must also set the --oov option", file = sys.stderr)
try:
f = open(args.symbol_table, 'r')
f = open(args.symbol_table, 'r', encoding='utf-8')
for line in f.readlines():
[ word, integer ] = line.split()
if int(integer) == args.oov:
Expand Down
4 changes: 2 additions & 2 deletions egs/wsj/s5/steps/cleanup/internal/get_non_scored_words.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env python
#!/usr/bin/env python3

# Copyright 2016 Vimal Manohar
# 2016 Johns Hopkins University (author: Daniel Povey)
Expand Down Expand Up @@ -90,7 +90,7 @@ def read_lang(lang_dir):
raise

try:
for line in open(lang_dir + '/words.txt').readlines():
for line in open(lang_dir + '/words.txt', encoding='utf-8').readlines():
[ word, integer ] = line.split()
if int(integer) in silence_word_ints:
non_scored_words.add(word)
Expand Down
28 changes: 14 additions & 14 deletions egs/wsj/s5/steps/cleanup/internal/get_pron_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,22 +75,22 @@ def ReadEntries(file_handle):
# Each entry in the list represents the pronunciation candidate(s) of a word.
# For each non-<eps> word, the entry is a list: [utt_id, word, set(pronunciation_candidates)]. e.g:
# [911Mothers_2010W-0010916-0012901-1, other, set('AH DH ER', 'AH DH ER K AH N')]
# For each <eps>, we split the phones it aligns to into two parts: "nonsil_left",
# For each <eps>, we split the phones it aligns to into two parts: "nonsil_left",
# which includes phones before the first silphone, and "nonsil_right", which includes
# phones after the last silphone. For example, for <eps> : 'V SIL B AH SIL',
# phones after the last silphone. For example, for <eps> : 'V SIL B AH SIL',
# nonsil_left is 'V' and nonsil_right is empty ''. After processing an <eps> entry
# in ctm_prons, we put it in "info" as an entry: [utt_id, word, nonsil_right]
# only if its nonsil_right segment is not empty, which may be used when processing
# the next word.
#
#
# Normally, one non-<eps> word is only aligned to one pronunciation candidate. However
# when there is a preceding/following <eps>, like in the following example, we
# assume the phones aligned to <eps> should be statistically distributed
# to its neighboring words (BTW we assume there are no consecutive <eps> within an utterance.)
# Thus we append the "nonsil_left" segment of these phones to the pronunciation
# of the preceding word, if the last phone of this pronunciation is not a silence phone,
# Similarly we can add a pron candidate to the following word.
#
#
# For example, for the following part of a ctm_prons file:
# 911Mothers_2010W-0010916-0012901-1 other AH DH ER
# 911Mothers_2010W-0010916-0012901-1 <eps> K AH N SIL B
Expand All @@ -99,11 +99,11 @@ def ReadEntries(file_handle):
# 911Mothers_2010W-0010916-0012901-1 when W EH N
# 911Mothers_2010W-0010916-0012901-1 people P IY P AH L
# 911Mothers_2010W-0010916-0012901-1 <eps> SIL
# 911Mothers_2010W-0010916-0012901-1 heard HH ER
# 911Mothers_2010W-0010916-0012901-1 heard HH ER
# 911Mothers_2010W-0010916-0012901-1 <eps> D
# 911Mothers_2010W-0010916-0012901-1 that SIL DH AH T
# 911Mothers_2010W-0010916-0012901-1 my M AY
#
#
# The corresponding segment in the "info" list is:
# [911Mothers_2010W-0010916-0012901-1, other, set('AH DH ER', 'AH DH ER K AH N')]
# [911Mothers_2010W-0010916-0012901-1, <eps>, 'B'
Expand All @@ -113,7 +113,7 @@ def ReadEntries(file_handle):
# [911Mothers_2010W-0010916-0012901-1, <eps>, 'D']
# [911Mothers_2010W-0010916-0012901-1, that, set('SIL DH AH T')]
# [911Mothers_2010W-0010916-0012901-1, my, set('M AY')]
#
#
# Then we accumulate pronunciation stats from "info". Basically, for each occurrence
# of a word, each pronunciation candidate gets equal soft counts. e.g. In the above
# example, each pron candidate of "because" gets a count of 1/4. The stats is stored
Expand All @@ -139,20 +139,20 @@ def GetStatsFromCtmProns(silphones, optional_silence, non_scored_words, ctm_pron
# So we apply the same merging method in these cases.
if word == '<eps>' or (word in non_scored_words and word != '<unk>' and word != '<UNK>'):
nonsil_left = []
nonsil_right = []
nonsil_right = []
for phone in phones:
if phone in silphones:
break
nonsil_left.append(phone)

for phone in reversed(phones):
if phone in silphones:
break
nonsil_right.insert(0, phone)

# info[-1][0] is the utt_id of the last entry
if len(nonsil_left) > 0 and len(info) > 0 and utt == info[-1][0]:
# pron_ext is a set of extended pron candidates.
if len(nonsil_left) > 0 and len(info) > 0 and utt == info[-1][0]:
# pron_ext is a set of extended pron candidates.
pron_ext = set()
# info[-1][2] is the set of pron candidates of the last entry.
for pron in info[-1][2]:
Expand Down Expand Up @@ -211,7 +211,7 @@ def GetStatsFromCtmProns(silphones, optional_silence, non_scored_words, ctm_pron
stats[(word, phones)] = stats.get((word, phones), 0) + count
return stats

def WriteStats(stats, file_handle):
def WriteStats(stats, file_handle):
for word_pron, count in stats.items():
print('{0} {1} {2}'.format(count, word_pron[0], word_pron[1]), file=file_handle)
file_handle.close()
Expand All @@ -222,7 +222,7 @@ def Main():
non_scored_words = ReadEntries(args.non_scored_words_file_handle)
optional_silence = ReadEntries(args.optional_silence_file_handle)
stats = GetStatsFromCtmProns(silphones, optional_silence.pop(), non_scored_words, args.ctm_prons_file_handle)
WriteStats(stats, args.stats_file_handle)
WriteStats(stats, args.stats_file_handle)

if __name__ == "__main__":
Main()
12 changes: 7 additions & 5 deletions egs/wsj/s5/steps/cleanup/internal/make_one_biased_lm.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env python
#!/usr/bin/env python3

# Copyright 2016 Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0.
Expand Down Expand Up @@ -142,16 +142,18 @@ def CompletelyDiscountLowCountStates(self, min_count):
hist_to_total_count = self.GetHistToTotalCount()
for n in reversed(list(range(2, self.ngram_order))):
this_order_counts = self.counts[n]
to_delete = []
for hist in this_order_counts.keys():
if hist_to_total_count[hist] < min_count:
# we need to completely back off this count.
word_to_count = this_order_counts[hist]
del this_order_counts[hist] # delete the key from the dict.
# mark this key for deleting
to_delete.append(hist)
backoff_hist = hist[1:] # this will be a tuple not a list.
for word, count in word_to_count.items():
self.AddCount(backoff_hist, word, count)


for hist in to_delete:
del this_order_counts[hist]

# This backs off the counts according to Kneser-Ney (unmodified,
# with interpolation).
Expand Down Expand Up @@ -200,7 +202,7 @@ def AddTopWords(self, top_words_file):
word_to_count = self.counts[0][empty_history]
total = sum(word_to_count.values())
try:
f = open(top_words_file)
f = open(top_words_file, mode='r', encoding='utf-8')
except:
sys.exit("make_one_biased_lm.py: error opening top-words file: "
"--top-words=" + top_words_file)
Expand Down
8 changes: 4 additions & 4 deletions egs/wsj/s5/steps/cleanup/internal/modify_ctm_edits.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env python
#!/usr/bin/env python3

# Copyright 2016 Vimal Manohar
# 2016 Johns Hopkins University (author: Daniel Povey)
Expand Down Expand Up @@ -105,7 +105,7 @@
def ReadNonScoredWords(non_scored_words_file):
global non_scored_words
try:
f = open(non_scored_words_file)
f = open(non_scored_words_file, encoding='utf-8')
except:
sys.exit("modify_ctm_edits.py: error opening file: "
"--non-scored-words=" + non_scored_words_file)
Expand Down Expand Up @@ -317,12 +317,12 @@ def ProcessUtterance(split_lines_of_utt):

def ProcessData():
try:
f_in = open(args.ctm_edits_in)
f_in = open(args.ctm_edits_in, encoding='utf-8')
except:
sys.exit("modify_ctm_edits.py: error opening ctm-edits input "
"file {0}".format(args.ctm_edits_in))
try:
f_out = open(args.ctm_edits_out, 'w')
f_out = open(args.ctm_edits_out, 'w', encoding='utf-8')
except:
sys.exit("modify_ctm_edits.py: error opening ctm-edits output "
"file {0}".format(args.ctm_edits_out))
Expand Down
17 changes: 9 additions & 8 deletions egs/wsj/s5/steps/cleanup/internal/segment_ctm_edits.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#!/usr/bin/env python
#!/usr/bin/env python3


# Copyright 2016 Vimal Manohar
# 2016 Johns Hopkins University (author: Daniel Povey)
Expand Down Expand Up @@ -894,7 +895,7 @@ def AccWordStatsForUtterance(split_lines_of_utt,

def PrintWordStats(word_stats_out):
try:
f = open(word_stats_out, 'w')
f = open(word_stats_out, 'w', encoding='utf-8')
except:
sys.exit("segment_ctm_edits.py: error opening word-stats file --word-stats-out={0} "
"for writing".format(word_stats_out))
Expand Down Expand Up @@ -924,23 +925,23 @@ def PrintWordStats(word_stats_out):

def ProcessData():
try:
f_in = open(args.ctm_edits_in)
f_in = open(args.ctm_edits_in, encoding='utf-8')
except:
sys.exit("segment_ctm_edits.py: error opening ctm-edits input "
"file {0}".format(args.ctm_edits_in))
try:
text_output_handle = open(args.text_out, 'w')
text_output_handle = open(args.text_out, 'w', encoding='utf-8')
except:
sys.exit("segment_ctm_edits.py: error opening text output "
"file {0}".format(args.text_out))
try:
segments_output_handle = open(args.segments_out, 'w')
segments_output_handle = open(args.segments_out, 'w', encoding='utf-8')
except:
sys.exit("segment_ctm_edits.py: error opening segments output "
"file {0}".format(args.text_out))
if args.ctm_edits_out != None:
try:
ctm_edits_output_handle = open(args.ctm_edits_out, 'w')
ctm_edits_output_handle = open(args.ctm_edits_out, 'w', encoding='utf-8')
except:
sys.exit("segment_ctm_edits.py: error opening ctm-edits output "
"file {0}".format(args.ctm_edits_out))
Expand Down Expand Up @@ -994,7 +995,7 @@ def ProcessData():
def ReadNonScoredWords(non_scored_words_file):
global non_scored_words
try:
f = open(non_scored_words_file)
f = open(non_scored_words_file, encoding='utf-8')
except:
sys.exit("segment_ctm_edits.py: error opening file: "
"--non-scored-words=" + non_scored_words_file)
Expand All @@ -1015,7 +1016,7 @@ def ReadNonScoredWords(non_scored_words_file):
oov_symbol = None
if args.oov_symbol_file != None:
try:
with open(args.oov_symbol_file) as f:
with open(args.oov_symbol_file, encoding='utf-8') as f:
line = f.readline()
assert len(line.split()) == 1
oov_symbol = line.split()[0]
Expand Down
16 changes: 9 additions & 7 deletions egs/wsj/s5/steps/cleanup/make_biased_lms.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env python
#!/usr/bin/env python3

from __future__ import print_function
import sys
Expand Down Expand Up @@ -55,21 +55,23 @@ def ProcessGroupOfLines(group_of_lines):
try:
command = "steps/cleanup/internal/make_one_biased_lm.py " + args.lm_opts
p = subprocess.Popen(command, shell = True, stdin = subprocess.PIPE,
stdout = sys.stdout, stderr = sys.stderr)
stdout = sys.stdout, stderr = sys.stderr)
for line in group_of_lines:
a = line.split()
if len(a) == 0:
sys.exit("make_biased_lms.py: empty input line")
utterance_id = a[0]
# print <utt> <utt-group> to utterance-map file
print(utterance_id, group_utterance_id, file = utterance_map_file)
rest_of_line = ' '.join(a[1:]) # get rid of utterance id.
print(rest_of_line, file=p.stdin)
rest_of_line = ' '.join(a[1:]) + '\n' # get rid of utterance id.
p.stdin.write(rest_of_line.encode('utf-8'))
p.stdin.close()
assert p.wait() == 0
except Exception as e:
sys.exit("make_biased_lms.py: error calling subprocess, command was: " +
command + ", error was : " + str(e))
except Exception:
sys.stderr.write(
"make_biased_lms.py: error calling subprocess, command was: " +
command)
raise
# Print a blank line; this terminates the FST in the Kaldi fst-archive
# format.
print("")
Expand Down