Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion egs/wsj/s5/steps/cleanup/internal/align_ctm_ref.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ def read_text(text_file):
"Did not get enough columns; line {0} in {1}"
"".format(line, text_file.name))
elif len(parts) == 1:
logger.warn("Empty transcript for utterance %s in %s",
logger.warn("Empty transcript for utterance %s in %s",
parts[0], text_file.name)
yield parts[0], []
else:
Expand Down
10 changes: 5 additions & 5 deletions egs/wsj/s5/steps/cleanup/internal/get_ctm_edits.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env python
#!/usr/bin/env python3

# Copyright 2016 Vimal Manohar
# 2016 Johns Hopkins University (author: Daniel Povey)
Expand Down Expand Up @@ -116,17 +116,17 @@
def OpenFiles():
global ctm_edits_out, edits_in, ctm_in, symbol_table, oov_word
try:
ctm_edits_out = open(args.ctm_edits_out, 'w')
ctm_edits_out = open(args.ctm_edits_out, 'w', encoding='utf-8')
except:
sys.exit("get_ctm_edits.py: error opening ctm-edits file {0} for output".format(
args.ctm_edits_out))
try:
edits_in = open(args.edits_in)
edits_in = open(args.edits_in, encoding='utf-8')
except:
sys.exit("get_ctm_edits.py: error opening edits file {0} for input".format(
args.edits_in))
try:
ctm_in = open(args.ctm_in)
ctm_in = open(args.ctm_in, encoding='utf-8')
except:
sys.exit("get_ctm_edits.py: error opening ctm file {0} for input".format(
args.ctm_in))
Expand All @@ -138,7 +138,7 @@ def OpenFiles():
print("get_ctm_edits.py: error: if you set the the --symbol-table option "
"you must also set the --oov option", file = sys.stderr)
try:
f = open(args.symbol_table, 'r')
f = open(args.symbol_table, 'r', encoding='utf-8')
for line in f.readlines():
[ word, integer ] = line.split()
if int(integer) == args.oov:
Expand Down
4 changes: 2 additions & 2 deletions egs/wsj/s5/steps/cleanup/internal/get_non_scored_words.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env python
#!/usr/bin/env python3

# Copyright 2016 Vimal Manohar
# 2016 Johns Hopkins University (author: Daniel Povey)
Expand Down Expand Up @@ -90,7 +90,7 @@ def read_lang(lang_dir):
raise

try:
for line in open(lang_dir + '/words.txt').readlines():
for line in open(lang_dir + '/words.txt', encoding='utf-8').readlines():
[ word, integer ] = line.split()
if int(integer) in silence_word_ints:
non_scored_words.add(word)
Expand Down
28 changes: 14 additions & 14 deletions egs/wsj/s5/steps/cleanup/internal/get_pron_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,22 +75,22 @@ def ReadEntries(file_handle):
# Each entry in the list represents the pronunciation candidate(s) of a word.
# For each non-<eps> word, the entry is a list: [utt_id, word, set(pronunciation_candidates)]. e.g:
# [911Mothers_2010W-0010916-0012901-1, other, set('AH DH ER', 'AH DH ER K AH N')]
# For each <eps>, we split the phones it aligns to into two parts: "nonsil_left",
# For each <eps>, we split the phones it aligns to into two parts: "nonsil_left",
# which includes phones before the first silphone, and "nonsil_right", which includes
# phones after the last silphone. For example, for <eps> : 'V SIL B AH SIL',
# phones after the last silphone. For example, for <eps> : 'V SIL B AH SIL',
# nonsil_left is 'V' and nonsil_right is empty ''. After processing an <eps> entry
# in ctm_prons, we put it in "info" as an entry: [utt_id, word, nonsil_right]
# only if its nonsil_right segment is not empty, which may be used when processing
# the next word.
#
#
# Normally, one non-<eps> word is only aligned to one pronunciation candidate. However
# when there is a preceding/following <eps>, like in the following example, we
# assume the phones aligned to <eps> should be statistically distributed
# to its neighboring words (BTW we assume there are no consecutive <eps> within an utterance.)
# Thus we append the "nonsil_left" segment of these phones to the pronunciation
# of the preceding word, if the last phone of this pronunciation is not a silence phone,
# Similarly we can add a pron candidate to the following word.
#
#
# For example, for the following part of a ctm_prons file:
# 911Mothers_2010W-0010916-0012901-1 other AH DH ER
# 911Mothers_2010W-0010916-0012901-1 <eps> K AH N SIL B
Expand All @@ -99,11 +99,11 @@ def ReadEntries(file_handle):
# 911Mothers_2010W-0010916-0012901-1 when W EH N
# 911Mothers_2010W-0010916-0012901-1 people P IY P AH L
# 911Mothers_2010W-0010916-0012901-1 <eps> SIL
# 911Mothers_2010W-0010916-0012901-1 heard HH ER
# 911Mothers_2010W-0010916-0012901-1 heard HH ER
# 911Mothers_2010W-0010916-0012901-1 <eps> D
# 911Mothers_2010W-0010916-0012901-1 that SIL DH AH T
# 911Mothers_2010W-0010916-0012901-1 my M AY
#
#
# The corresponding segment in the "info" list is:
# [911Mothers_2010W-0010916-0012901-1, other, set('AH DH ER', 'AH DH ER K AH N')]
# [911Mothers_2010W-0010916-0012901-1, <eps>, 'B'
Expand All @@ -113,7 +113,7 @@ def ReadEntries(file_handle):
# [911Mothers_2010W-0010916-0012901-1, <eps>, 'D']
# [911Mothers_2010W-0010916-0012901-1, that, set('SIL DH AH T')]
# [911Mothers_2010W-0010916-0012901-1, my, set('M AY')]
#
#
# Then we accumulate pronunciation stats from "info". Basically, for each occurrence
# of a word, each pronunciation candidate gets equal soft counts. e.g. In the above
# example, each pron candidate of "because" gets a count of 1/4. The stats is stored
Expand All @@ -139,20 +139,20 @@ def GetStatsFromCtmProns(silphones, optional_silence, non_scored_words, ctm_pron
# So we apply the same merging method in these cases.
if word == '<eps>' or (word in non_scored_words and word != '<unk>' and word != '<UNK>'):
nonsil_left = []
nonsil_right = []
nonsil_right = []
for phone in phones:
if phone in silphones:
break
nonsil_left.append(phone)

for phone in reversed(phones):
if phone in silphones:
break
nonsil_right.insert(0, phone)

# info[-1][0] is the utt_id of the last entry
if len(nonsil_left) > 0 and len(info) > 0 and utt == info[-1][0]:
# pron_ext is a set of extended pron candidates.
if len(nonsil_left) > 0 and len(info) > 0 and utt == info[-1][0]:
# pron_ext is a set of extended pron candidates.
pron_ext = set()
# info[-1][2] is the set of pron candidates of the last entry.
for pron in info[-1][2]:
Expand Down Expand Up @@ -211,7 +211,7 @@ def GetStatsFromCtmProns(silphones, optional_silence, non_scored_words, ctm_pron
stats[(word, phones)] = stats.get((word, phones), 0) + count
return stats

def WriteStats(stats, file_handle):
def WriteStats(stats, file_handle):
for word_pron, count in stats.items():
print('{0} {1} {2}'.format(count, word_pron[0], word_pron[1]), file=file_handle)
file_handle.close()
Expand All @@ -222,7 +222,7 @@ def Main():
non_scored_words = ReadEntries(args.non_scored_words_file_handle)
optional_silence = ReadEntries(args.optional_silence_file_handle)
stats = GetStatsFromCtmProns(silphones, optional_silence.pop(), non_scored_words, args.ctm_prons_file_handle)
WriteStats(stats, args.stats_file_handle)
WriteStats(stats, args.stats_file_handle)

if __name__ == "__main__":
Main()
12 changes: 7 additions & 5 deletions egs/wsj/s5/steps/cleanup/internal/make_one_biased_lm.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env python
#!/usr/bin/env python3

# Copyright 2016 Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0.
Expand Down Expand Up @@ -142,16 +142,18 @@ def CompletelyDiscountLowCountStates(self, min_count):
hist_to_total_count = self.GetHistToTotalCount()
for n in reversed(list(range(2, self.ngram_order))):
this_order_counts = self.counts[n]
to_delete = []
for hist in this_order_counts.keys():
if hist_to_total_count[hist] < min_count:
# we need to completely back off this count.
word_to_count = this_order_counts[hist]
del this_order_counts[hist] # delete the key from the dict.
# mark this key for deleting
to_delete.append(hist)
backoff_hist = hist[1:] # this will be a tuple not a list.
for word, count in word_to_count.items():
self.AddCount(backoff_hist, word, count)


for hist in to_delete:
del this_order_counts[hist]

# This backs off the counts according to Kneser-Ney (unmodified,
# with interpolation).
Expand Down Expand Up @@ -200,7 +202,7 @@ def AddTopWords(self, top_words_file):
word_to_count = self.counts[0][empty_history]
total = sum(word_to_count.values())
try:
f = open(top_words_file)
f = open(top_words_file, mode='r', encoding='utf-8')
except:
sys.exit("make_one_biased_lm.py: error opening top-words file: "
"--top-words=" + top_words_file)
Expand Down
8 changes: 4 additions & 4 deletions egs/wsj/s5/steps/cleanup/internal/modify_ctm_edits.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env python
#!/usr/bin/env python3

# Copyright 2016 Vimal Manohar
# 2016 Johns Hopkins University (author: Daniel Povey)
Expand Down Expand Up @@ -105,7 +105,7 @@
def ReadNonScoredWords(non_scored_words_file):
global non_scored_words
try:
f = open(non_scored_words_file)
f = open(non_scored_words_file, encoding='utf-8')
except:
sys.exit("modify_ctm_edits.py: error opening file: "
"--non-scored-words=" + non_scored_words_file)
Expand Down Expand Up @@ -317,12 +317,12 @@ def ProcessUtterance(split_lines_of_utt):

def ProcessData():
try:
f_in = open(args.ctm_edits_in)
f_in = open(args.ctm_edits_in, encoding='utf-8')
except:
sys.exit("modify_ctm_edits.py: error opening ctm-edits input "
"file {0}".format(args.ctm_edits_in))
try:
f_out = open(args.ctm_edits_out, 'w')
f_out = open(args.ctm_edits_out, 'w', encoding='utf-8')
except:
sys.exit("modify_ctm_edits.py: error opening ctm-edits output "
"file {0}".format(args.ctm_edits_out))
Expand Down
17 changes: 9 additions & 8 deletions egs/wsj/s5/steps/cleanup/internal/segment_ctm_edits.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#!/usr/bin/env python
#!/usr/bin/env python3


# Copyright 2016 Vimal Manohar
# 2016 Johns Hopkins University (author: Daniel Povey)
Expand Down Expand Up @@ -894,7 +895,7 @@ def AccWordStatsForUtterance(split_lines_of_utt,

def PrintWordStats(word_stats_out):
try:
f = open(word_stats_out, 'w')
f = open(word_stats_out, 'w', encoding='utf-8')
except:
sys.exit("segment_ctm_edits.py: error opening word-stats file --word-stats-out={0} "
"for writing".format(word_stats_out))
Expand Down Expand Up @@ -924,23 +925,23 @@ def PrintWordStats(word_stats_out):

def ProcessData():
try:
f_in = open(args.ctm_edits_in)
f_in = open(args.ctm_edits_in, encoding='utf-8')
except:
sys.exit("segment_ctm_edits.py: error opening ctm-edits input "
"file {0}".format(args.ctm_edits_in))
try:
text_output_handle = open(args.text_out, 'w')
text_output_handle = open(args.text_out, 'w', encoding='utf-8')
except:
sys.exit("segment_ctm_edits.py: error opening text output "
"file {0}".format(args.text_out))
try:
segments_output_handle = open(args.segments_out, 'w')
segments_output_handle = open(args.segments_out, 'w', encoding='utf-8')
except:
sys.exit("segment_ctm_edits.py: error opening segments output "
"file {0}".format(args.text_out))
if args.ctm_edits_out != None:
try:
ctm_edits_output_handle = open(args.ctm_edits_out, 'w')
ctm_edits_output_handle = open(args.ctm_edits_out, 'w', encoding='utf-8')
except:
sys.exit("segment_ctm_edits.py: error opening ctm-edits output "
"file {0}".format(args.ctm_edits_out))
Expand Down Expand Up @@ -994,7 +995,7 @@ def ProcessData():
def ReadNonScoredWords(non_scored_words_file):
global non_scored_words
try:
f = open(non_scored_words_file)
f = open(non_scored_words_file, encoding='utf-8')
except:
sys.exit("segment_ctm_edits.py: error opening file: "
"--non-scored-words=" + non_scored_words_file)
Expand All @@ -1015,7 +1016,7 @@ def ReadNonScoredWords(non_scored_words_file):
oov_symbol = None
if args.oov_symbol_file != None:
try:
with open(args.oov_symbol_file) as f:
with open(args.oov_symbol_file, encoding='utf-8') as f:
line = f.readline()
assert len(line.split()) == 1
oov_symbol = line.split()[0]
Expand Down
16 changes: 9 additions & 7 deletions egs/wsj/s5/steps/cleanup/make_biased_lms.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env python
#!/usr/bin/env python3

from __future__ import print_function
import sys
Expand Down Expand Up @@ -55,21 +55,23 @@ def ProcessGroupOfLines(group_of_lines):
try:
command = "steps/cleanup/internal/make_one_biased_lm.py " + args.lm_opts
p = subprocess.Popen(command, shell = True, stdin = subprocess.PIPE,
stdout = sys.stdout, stderr = sys.stderr)
stdout = sys.stdout, stderr = sys.stderr)
for line in group_of_lines:
a = line.split()
if len(a) == 0:
sys.exit("make_biased_lms.py: empty input line")
utterance_id = a[0]
# print <utt> <utt-group> to utterance-map file
print(utterance_id, group_utterance_id, file = utterance_map_file)
rest_of_line = ' '.join(a[1:]) # get rid of utterance id.
print(rest_of_line, file=p.stdin)
rest_of_line = ' '.join(a[1:]) + '\n' # get rid of utterance id.
p.stdin.write(rest_of_line.encode('utf-8'))
p.stdin.close()
assert p.wait() == 0
except Exception as e:
sys.exit("make_biased_lms.py: error calling subprocess, command was: " +
command + ", error was : " + str(e))
except Exception:
sys.stderr.write(
"make_biased_lms.py: error calling subprocess, command was: " +
command)
raise
# Print a blank line; this terminates the FST in the Kaldi fst-archive
# format.
print("")
Expand Down