Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
352 changes: 0 additions & 352 deletions egs/wsj/s5/steps/cleanup/get_ctm_edits.py

This file was deleted.

90 changes: 0 additions & 90 deletions egs/wsj/s5/steps/cleanup/get_non_scored_words.py

This file was deleted.

15 changes: 12 additions & 3 deletions egs/wsj/s5/steps/cleanup/internal/get_ctm_edits.py
Original file line number Diff line number Diff line change
Expand Up @@ -275,11 +275,20 @@ def OutputCtm(utterance_id, edits_array, ctm_array):
def ProcessOneUtterance(utterance_id, edits_line, ctm_lines):
try:
# Remove the utterance-id from the beginning of the edits line
edits_line = edits_line[len(utterance_id) + 1:]
edits_fields = edits_line[len(utterance_id) + 1:]

# e.g. if edits_line is now 'i i ; see be ; my my ', edits_array will become
# e.g. if edits_fields is now 'i i ; see be ; my my ', edits_array will become
# [ ['i', 'i'], ['see', 'be'], ['my', 'my'] ]
edits_array = [ x.split() for x in edits_line.split(";") ]
fields_split = edits_fields.split()
first_fields, second_fields = fields_split[0::3], fields_split[1::3]
if (
len(first_fields) != len(second_fields) or
(len(fields_split) >= 3 and set(fields_split[2::3]) != {';'})
):
sys.exit("get_ctm_edits.py: could not make sense of edits line: " + edits_line)

edits_array = list(zip(first_fields, second_fields))

# ctm_array will now become something like [ ['1', '1.010', '0.240', 'little ' ], ... ]
ctm_array = [ x.split() for x in ctm_lines ]
ctm_array = []
Expand Down
12 changes: 6 additions & 6 deletions egs/wsj/s5/steps/cleanup/internal/get_non_scored_words.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,19 +39,19 @@ def ReadLang(lang_dir):
global non_scored_words

if not os.path.isdir(lang_dir):
sys.exit("modify_ctm_edits.py expected lang/ directory {0} to "
sys.exit("get_non_scored_words.py expected lang/ directory {0} to "
"exist.".format(lang_dir))
for f in [ '/words.txt', '/phones/silence.int', '/phones/align_lexicon.int' ]:
if not os.path.exists(lang_dir + f):
sys.exit("modify_ctm_edits.py: expected file {0}{1} to exist.".format(
sys.exit("get_non_scored_words.py: expected file {0}{1} to exist.".format(
lang_dir, f))
# read silence-phones.
try:
silence_phones = set()
for line in open(lang_dir + '/phones/silence.int').readlines():
silence_phones.add(int(line))
except Exception as e:
sys.exit("modify_ctm_edits.py: problem reading file "
sys.exit("get_non_scored_words.py: problem reading file "
"{0}/phones/silence.int: {1}".format(lang_dir, str(e)))

# read align_lexicon.int.
Expand All @@ -67,7 +67,7 @@ def ReadLang(lang_dir):
int(a[2]) in silence_phones:
silence_word_ints.add(int(a[0]))
except Exception as e:
sys.exit("modify_ctm_edits.py: problem reading file "
sys.exit("get_non_scored_words.py: problem reading file "
"{0}/phones/align_lexicon.int: "
"{1}".format(lang_dir, str(e)))

Expand All @@ -77,11 +77,11 @@ def ReadLang(lang_dir):
if int(integer) in silence_word_ints:
non_scored_words.add(word)
except Exception as e:
sys.exit("modify_ctm_edits.py: problem reading file "
sys.exit("get_non_scored_words.py: problem reading file "
"{0}/words.txt.int: {1}".format(lang_dir, str(e)))

if not len(non_scored_words) == len(silence_word_ints):
sys.exit("modify_ctm_edits.py: error getting silence words, len({0}) != len({1})",
sys.exit("get_non_scored_words.py: error getting silence words, len({0}) != len({1})",
str(non_scored_words), str(silence_word_ints))
for word in non_scored_words:
print(word)
Expand Down
2 changes: 1 addition & 1 deletion egs/wsj/s5/steps/cleanup/internal/modify_ctm_edits.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@

parser = argparse.ArgumentParser(
description = "This program modifies the reference in the ctm-edits which "
"is output by steps/cleanup/get_ctm_edits.py, to allow insertions, deletions and "
"is output by steps/cleanup/internal/get_ctm_edits.py, to allow insertions, deletions and "
"substitutions of non-scored words, and [if --allow-repetitions=true], "
"duplications of single words or pairs of scored words (to account for dysfluencies "
"that were not transcribed). Note: deletions and substitutions of non-scored words "
Expand Down
20 changes: 10 additions & 10 deletions egs/wsj/s5/steps/cleanup/internal/segment_ctm_edits.py
Original file line number Diff line number Diff line change
Expand Up @@ -912,34 +912,34 @@ def ProcessData():
try:
f_in = open(args.ctm_edits_in)
except:
sys.exit("modify_ctm_edits.py: error opening ctm-edits input "
sys.exit("segment_ctm_edits.py: error opening ctm-edits input "
"file {0}".format(args.ctm_edits_in))
try:
text_output_handle = open(args.text_out, 'w')
except:
sys.exit("modify_ctm_edits.py: error opening text output "
sys.exit("segment_ctm_edits.py: error opening text output "
"file {0}".format(args.text_out))
try:
segments_output_handle = open(args.segments_out, 'w')
except:
sys.exit("modify_ctm_edits.py: error opening segments output "
sys.exit("segment_ctm_edits.py: error opening segments output "
"file {0}".format(args.text_out))
if args.ctm_edits_out != None:
try:
ctm_edits_output_handle = open(args.ctm_edits_out, 'w')
except:
sys.exit("modify_ctm_edits.py: error opening ctm-edits output "
sys.exit("segment_ctm_edits.py: error opening ctm-edits output "
"file {0}".format(args.ctm_edits_out))

# Most of what we're doing in the lines below is splitting the input lines
# and grouping them per utterance, before giving them to ProcessUtterance()
# and then printing the modified lines.
first_line = f_in.readline()
if first_line == '':
sys.exit("modify_ctm_edits.py: empty input")
sys.exit("segment_ctm_edits.py: empty input")
split_pending_line = first_line.split()
if len(split_pending_line) == 0:
sys.exit("modify_ctm_edits.py: bad input line " + first_line)
sys.exit("segment_ctm_edits.py: bad input line " + first_line)
cur_utterance = split_pending_line[0]
split_lines_of_cur_utterance = []

Expand All @@ -966,14 +966,14 @@ def ProcessData():
split_pending_line = next_line.split()
if len(split_pending_line) == 0:
if next_line != '':
sys.exit("modify_ctm_edits.py: got an empty or whitespace input line")
sys.exit("segment_ctm_edits.py: got an empty or whitespace input line")
try:
text_output_handle.close()
segments_output_handle.close()
if args.ctm_edits_out != None:
ctm_edits_output_handle.close()
except:
sys.exit("modify_ctm_edits.py: error closing one or more outputs "
sys.exit("segment_ctm_edits.py: error closing one or more outputs "
"(broken pipe or full disk?)")


Expand All @@ -982,12 +982,12 @@ def ReadNonScoredWords(non_scored_words_file):
try:
f = open(non_scored_words_file)
except:
sys.exit("modify_ctm_edits.py: error opening file: "
sys.exit("segment_ctm_edits.py: error opening file: "
"--non-scored-words=" + non_scored_words_file)
for line in f.readlines():
a = line.split()
if not len(line.split()) == 1:
sys.exit("modify_ctm_edits.py: bad line in non-scored-words "
sys.exit("segment_ctm_edits.py: bad line in non-scored-words "
"file {0}: {1}".format(non_scored_words_file, line))
non_scored_words.add(a[0])
f.close()
Expand Down
16 changes: 8 additions & 8 deletions egs/wsj/s5/steps/cleanup/internal/taint_ctm_edits.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,12 +132,12 @@ def ProcessData():
try:
f_in = open(args.ctm_edits_in)
except:
sys.exit("modify_ctm_edits.py: error opening ctm-edits input "
sys.exit("taint_ctm_edits.py: error opening ctm-edits input "
"file {0}".format(args.ctm_edits_in))
try:
f_out = open(args.ctm_edits_out, 'w')
except:
sys.exit("modify_ctm_edits.py: error opening ctm-edits output "
sys.exit("taint_ctm_edits.py: error opening ctm-edits output "
"file {0}".format(args.ctm_edits_out))
num_lines_processed = 0

Expand All @@ -147,10 +147,10 @@ def ProcessData():
# and then printing the modified lines.
first_line = f_in.readline()
if first_line == '':
sys.exit("modify_ctm_edits.py: empty input")
sys.exit("taint_ctm_edits.py: empty input")
split_pending_line = first_line.split()
if len(split_pending_line) == 0:
sys.exit("modify_ctm_edits.py: bad input line " + first_line)
sys.exit("taint_ctm_edits.py: bad input line " + first_line)
cur_utterance = split_pending_line[0]
split_lines_of_cur_utterance = []

Expand All @@ -170,7 +170,7 @@ def ProcessData():
split_pending_line = next_line.split()
if len(split_pending_line) == 0:
if next_line != '':
sys.exit("modify_ctm_edits.py: got an empty or whitespace input line")
sys.exit("taint_ctm_edits.py: got an empty or whitespace input line")
try:
f_out.close()
except:
Expand All @@ -181,13 +181,13 @@ def PrintNonScoredStats():
if args.verbose < 1:
return
if num_lines == 0:
print("modify_ctm_edits.py: processed no input.", file = sys.stderr)
print("taint_ctm_edits.py: processed no input.", file = sys.stderr)
num_lines_modified = sum(ref_change_stats.values())
num_incorrect_lines = num_lines - num_correct_lines
percent_lines_incorrect= '%.2f' % (num_incorrect_lines * 100.0 / num_lines)
percent_modified = '%.2f' % (num_lines_modified * 100.0 / num_lines);
percent_of_incorrect_modified = '%.2f' % (num_lines_modified * 100.0 / num_incorrect_lines)
print("modify_ctm_edits.py: processed {0} lines of ctm ({1}% of which incorrect), "
print("taint_ctm_edits.py: processed {0} lines of ctm ({1}% of which incorrect), "
"of which {2} were changed fixing the reference for non-scored words "
"({3}% of lines, or {4}% of incorrect lines)".format(
num_lines, percent_lines_incorrect, num_lines_modified,
Expand All @@ -198,7 +198,7 @@ def PrintNonScoredStats():
key = lambda x: ref_change_stats[x])
num_keys_to_print = 40 if args.verbose >= 2 else 10

print("modify_ctm_edits.py: most common edits (as percentages "
print("taint_ctm_edits.py: most common edits (as percentages "
"of all such edits) are:\n" +
('\n'.join([ '%s [%.2f%%]' % (k, ref_change_stats[k]*100.0/num_lines_modified)
for k in keys[0:num_keys_to_print]]))
Expand Down
2 changes: 1 addition & 1 deletion egs/wsj/s5/steps/cleanup/lattice_oracle_align.sh
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,7 @@ if [ $stage -le 5 ]; then

$cmd $dir/log/get_ctm_edits.log \
align-text ark:$dir/oracle_hyp.txt ark:$dir/text ark,t:- \| \
steps/cleanup/get_ctm_edits.py --oov=$oov --symbol-table=$lang/words.txt \
steps/cleanup/internal/get_ctm_edits.py --oov=$oov --symbol-table=$lang/words.txt \
/dev/stdin $dir/ctm $dir/ctm_edits || exit 1

echo "$0: ctm with edits information appended is in $dir/ctm_edits"
Expand Down
Loading