diff --git a/egs/wsj/s5/steps/cleanup/get_ctm_edits.py b/egs/wsj/s5/steps/cleanup/get_ctm_edits.py
deleted file mode 100755
index ee75cfb4dfe..00000000000
--- a/egs/wsj/s5/steps/cleanup/get_ctm_edits.py
+++ /dev/null
@@ -1,352 +0,0 @@
-#!/usr/bin/env python
-
-# Copyright 2016  Vimal Manohar
-#           2016  Johns Hopkins University (author: Daniel Povey)
-# Apache 2.0
-
-from __future__ import print_function
-import sys, operator, argparse
-
-# Modify the CTM to include for each token the information from Levenshtein
-# alignment of 'hypothesis' and 'reference'
-# (i.e. the output of 'align-text').
-
-# The information added to each token in the CTM is the reference word and one
-# of the following edit-types:
-#  'cor' = correct  [note: as a special case we count as correct cases where
-#          the hypothesis word is the OOV symbol and the reference
-#          word is OOV w.r.t. the supplied vocabulary.]
-#  'sub' = substitution
-#  'del' = deletion
-#  'ins' = insertion
-#  'sil' = (silence in ctm; does not consume a reference word)
-# note: the following extra edit-type may be added by modify_ctm_edits.py:
-#  'fix' ... this is like 'cor', but it means the reference has been modified
-#          to fix non-scoreable errors [typically errors that don't change the
-#          meaning], so we don't trust the word or value it as much as a 'cor'.
-
-# Note: Additional lines are added to the CTM to account for deletions.
-
-# Input CTM:
-# (note: the <eps> is for silence in the input CTM that comes from
-# optional-silence in the graph.  However, the input edits don't have anything
-# for these silences.
-# We assume (and check) that the channel will always be '1', because the
-# input CTMs are expected to be 'per utterance', not including real
-# recording-ids.
-
-# Input ctm format:
-# <file-id> <channel> <start-time> <duration> <hyp-word> [<confidence>]
-# note, the confidence defaults to 1 if not provided (these
-# scripts don't actually use the confidence field).
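As a quick aid to reading the deleted script below, here is a minimal standalone sketch (not part of the original file) of how one line in the input ctm format just described can be parsed; the default confidence of 1.0 follows the note above:

    def parse_ctm_line(line):
        # <file-id> <channel> <start-time> <duration> <hyp-word> [<confidence>]
        fields = line.split()
        if len(fields) == 5:
            fields.append('1.0')  # confidence defaults to 1 when absent
        file_id, channel, start, dur, word, conf = fields
        assert channel == '1'     # per-utterance ctms always use channel '1'
        return file_id, float(start), float(dur), word, float(conf)

    print(parse_ctm_line('TimBrown_2008P-0007226-0007620 1 0.850 0.450 go'))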
-
-## TimBrown_2008P-0007226-0007620 1 0.000 0.100 when
-## TimBrown_2008P-0007226-0007620 1 0.100 0.090 i
-## TimBrown_2008P-0007226-0007620 1 0.190 0.300 some
-## TimBrown_2008P-0007226-0007620 1 0.490 0.110 when
-## TimBrown_2008P-0007226-0007620 1 0.600 0.060 i
-## TimBrown_2008P-0007226-0007620 1 0.660 0.190 say
-## TimBrown_2008P-0007226-0007620 1 0.850 0.450 go
-## TimBrown_2008P-0007226-0007620 1 1.300 0.310 [COUGH]
-## TimBrown_2008P-0007226-0007620 1 1.610 0.130 you
-## TimBrown_2008P-0007226-0007620 1 1.740 0.180 got
-## TimBrown_2008P-0007226-0007620 1 1.920 0.370 thirty
-## TimBrown_2008P-0007226-0007620 1 2.290 0.830 seconds
-## TimBrown_2008P-0007226-0007620 1 3.120 0.330 <eps>
-## TimBrown_2008P-0007226-0007620 1 3.450 0.040 [BREATH]
-## TimBrown_2008P-0007226-0007620 1 3.490 0.110 to
-## TimBrown_2008P-0007226-0007620 1 3.600 0.320 [NOISE]
-
-# Input Levenshtein edits : (the output of 'align-text' post-processed by 'wer_per_utt_details.pl')
-
-# AJJacobs_2007P-0001605-0003029 i i ; thought thought ; i'd i'd ; tell tell ; you you ; a a ; little little ; about about ; [UH] [UH] ; what what ; i i ; like like ; to to ; write write ; and and ; [UH] [UH] ; i i ; like like ; to to ; [UH] [UH] ; immerse immerse ; myself myself ; [SMACK] [SMACK] ; in in ; my my ; topics topics ; [UM] [UM] ; i i ; just just ; like like ; to to ; [UH] [UH] ; dive dive ; [SMACK] [SMACK] ; right right ; in in ; and and ; become become ; [UH] [UH] ; sort sort ; of of ; a a ; human human ; guinea guinea ; pig pig ; [BREATH] [BREATH] ; and and ; [UH] [UH]
-# AJJacobs_2007P-0003133-0004110 i i ; see see ; my my ; life life ; as as ; a a ; series series ; of of ; experiments experiments ; [BREATH] [BREATH] ; so so ; [UH] [UH] ; i i ; [NOISE] [NOISE] ; work work ; for for ; esquire esquire ; magazine magazine ; and <eps> ; a a ; couple couple ; of of ; years years ; ago ago ; [BREATH] [BREATH] ; i i ; wrote wrote ; an an ; article article ; called called ; [NOISE] [NOISE] ; my my ; outsourced outsourced ; life life
-
-
-# Output format:
-# <file-id> <channel> <start-time> <duration> <hyp-word> <confidence> <ref-word> <edit-type>
-
-# AJJacobs_2007P-0001605-0003029 1 0 0.09 <eps> 1.0 <eps> sil
-# AJJacobs_2007P-0001605-0003029 1 0.09 0.15 i 1.0 i cor
-# AJJacobs_2007P-0001605-0003029 1 0.24 0.25 thought 1.0 thought cor
-# AJJacobs_2007P-0001605-0003029 1 0.49 0.14 i'd 1.0 i'd cor
-# AJJacobs_2007P-0001605-0003029 1 0.63 0.22 tell 1.0 tell cor
-# AJJacobs_2007P-0001605-0003029 1 0.85 0.11 you 1.0 you cor
-# AJJacobs_2007P-0001605-0003029 1 0.96 0.05 a 1.0 a cor
-# AJJacobs_2007P-0001605-0003029 1 1.01 0.24 little 1.0 little cor
-# AJJacobs_2007P-0001605-0003029 1 1.25 0.5 about 1.0 about cor
-# AJJacobs_2007P-0001605-0003029 1 1.75 0.48 [UH] 1.0 [UH] cor
-# AJJacobs_2007P-0001605-0003029 1 2.23 0.34 <eps> 1.0 <eps> sil
-# AJJacobs_2007P-0001605-0003029 1 2.57 0.21 what 1.0 what cor
-# AJJacobs_2007P-0001605-0003029 1 2.78 0.1 i 1.0 i cor
-# AJJacobs_2007P-0001605-0003029 1 2.88 0.22 like 1.0 like cor
-# AJJacobs_2007P-0001605-0003029 1 3.1 0.13 to 1.0 to cor
-# AJJacobs_2007P-0001605-0003029 1 3.23 0.37 write 1.0 write cor
-# AJJacobs_2007P-0001605-0003029 1 3.6 0.03 <eps> 1.0 <eps> sil
-# AJJacobs_2007P-0001605-0003029 1 3.63 0.36 and 1.0 and cor
-
-
-parser = argparse.ArgumentParser(
-    description = "Append to the CTM the Levenshtein alignment of 'hypothesis' and 'reference'; "
-    "creates augmented CTM with extra fields (see script for details)")
-
-parser.add_argument("--oov", type = int, default = -1,
-                    help = "The integer representation of the OOV symbol; substitutions "
-                    "by the OOV symbol for out-of-vocabulary reference words are treated "
treated " - "as correct, if you also supply the --symbol-table option.") -parser.add_argument("--symbol-table", type = str, - help = "The words.txt your system used; if supplied, it is used to " - "determine OOV words (and such words will count as correct if " - "substituted by the OOV symbol). See also the --oov option") -# Required arguments -parser.add_argument("edits_in", metavar = "", - help = "Filename of output of 'align-text', which this program reads. " - "Use /dev/stdin for standard input.") -parser.add_argument("ctm_in", metavar = "", - help = "Filename of input hypothesis in ctm format") -parser.add_argument("ctm_edits_out", metavar = "", - help = "Filename of output (CTM appended with word-edit information)") -args = parser.parse_args() - - - -def OpenFiles(): - global ctm_edits_out, edits_in, ctm_in, symbol_table, oov_word - try: - ctm_edits_out = open(args.ctm_edits_out, 'w') - except: - sys.exit("get_ctm_edits.py: error opening ctm-edits file {0} for output".format( - args.ctm_edits_out)) - try: - edits_in = open(args.edits_in) - except: - sys.exit("get_ctm_edits.py: error opening edits file {0} for input".format( - args.edits_in)) - try: - ctm_in = open(args.ctm_in) - except: - sys.exit("get_ctm_edits.py: error opening ctm file {0} for input".format( - args.ctm_in)) - - symbol_table = set() - oov_word = None - if args.symbol_table != None: - if args.oov == -1: - print("get_ctm_edits.py: error: if you set the the --symbol-table option " - "you must also set the --oov option", file = sys.stderr) - try: - f = open(args.symbol_table, 'r') - for line in f.readlines(): - [ word, integer ] = line.split() - if int(integer) == args.oov: - oov_word = word - symbol_table.add(word) - except: - sys.exit("get_ctm_edits.py: error opening symbol-table file {0} for " - "input (or bad file), exception is: {1}".format(args.symbol_table)) - f.close() - if oov_word == None: - sys.exit("get_ctm_edits.py: OOV word not found: check the values of " - "--symbol-table={0} and --oov={1}".format(args.symbol_table, - args.oov)) - -# This function takes two lists -# edits_array = [ [ hyp_word1, ref_word1], [ hyp_word2, ref_word2 ], ... ] -# ctm_array = [ [ start1, duration1, hyp_word1, confidence1 ], ... ] -# -# and pads them with new list elements so that the entries 'match up'. What we -# are aiming for is that for each i, ctm_array[i][2] == edits_array[i][0]. The -# reasons why this is not automatically true are: -# -# (1) There may be deletions in the hypothesis sequence, which would lead to -# pairs like [ '', ref_word ]. -# (2) The ctm may have been written 'with silence', which will lead to -# ctm entries like [ 1, 7.8, 0.9, '' ] where the '' refers -# to the optional-silence from the lexicon. -# -# We introduce suitable entries in to edits_array and ctm_array as necessary -# to make them 'match up'. This function returns the pair (new_edits_array, -# new_ctm_array). -def PadArrays(edits_array, ctm_array): - new_edits_array = [] - new_ctm_array = [] - edits_len = len(edits_array) - ctm_len = len(ctm_array) - edits_pos = 0 - ctm_pos = 0 - # current_time is the end of the last ctm segment we processesed. 
-    current_time = ctm_array[0][0] if ctm_len > 0 else 0.0
-    while edits_pos < edits_len or ctm_pos < ctm_len:
-        if edits_pos < edits_len and ctm_pos < ctm_len and \
-           edits_array[edits_pos][0] == ctm_array[ctm_pos][2] and \
-           edits_array[edits_pos][0] != '<eps>':
-            # This is the normal case, where there are 2 entries whose
-            # hyp-words match up
-            new_edits_array.append(edits_array[edits_pos])
-            edits_pos += 1
-            new_ctm_array.append(ctm_array[ctm_pos])
-            current_time = ctm_array[ctm_pos][0] + ctm_array[ctm_pos][1]
-            ctm_pos += 1
-        elif edits_pos < edits_len and edits_array[edits_pos][0] == '<eps>':
-            # There was a deletion.  Pad with an empty ctm segment with '<eps>' as
-            # the word.
-            new_edits_array.append(edits_array[edits_pos])
-            edits_pos += 1
-            duration = 0.0
-            confidence = 1.0
-            new_ctm_array.append([ current_time, duration, '<eps>', confidence])
-        elif ctm_pos < ctm_len and ctm_array[ctm_pos][2] == '<eps>':
-            # There was silence in the ctm, and either we've reached the end of the
-            # edits sequence, or the hyp word was not '<eps>':
-            new_edits_array.append(['<eps>', '<eps>'])
-            new_ctm_array.append(ctm_array[ctm_pos])
-            current_time = ctm_array[ctm_pos][0] + ctm_array[ctm_pos][1]
-            ctm_pos += 1
-        else:
-            raise Exception("Could not align edits_array = {0} and ctm_array = {1}; "
-                            "edits-position = {2}, ctm-position = {3}, "
-                            "pending-edit={4}, pending-ctm-entry={5}".format(
-                    edits_array, ctm_array, edits_pos, ctm_pos,
-                    edits_array[edits_pos] if edits_pos < edits_len else None,
-                    ctm_array[ctm_pos] if ctm_pos < ctm_len else None))
-    assert len(new_edits_array) == len(new_ctm_array)
-    return (new_edits_array, new_ctm_array)
-
-
-# This function returns the appropriate edit-type to output in the ctm-edits
-# file.  The ref_word and hyp_word and duration are the values we'll print in
-# the ctm-edits file.
-def GetEditType(hyp_word, ref_word, duration):
-    global oov_word
-    if hyp_word == ref_word and hyp_word != '<eps>':
-        return 'cor'
-    elif hyp_word != '<eps>' and ref_word == '<eps>':
-        return 'ins'
-    elif hyp_word == '<eps>' and ref_word != '<eps>' and duration == 0.0:
-        return 'del'
-    elif hyp_word == oov_word and \
-         len(symbol_table) != 0 and not ref_word in symbol_table:
-        return 'cor'  # this special case is treated as correct.
-    elif hyp_word == '<eps>' == ref_word and duration > 0.0:
-        # silence in hypothesis; we don't match this up with any reference word.
-        return 'sil'
-    else:
-        # The following assertion is because, based on how PadArrays
-        # works, we shouldn't hit this case.
-        assert hyp_word != '<eps>' and ref_word != '<eps>'
-        return 'sub'
-
-# this prints a number with a certain number of digits after
-# the point, while removing trailing zeros.
-def FloatToString(f):
-    num_digits = 6  # we want to print 6 digits after the point
-    g = f
-    while abs(g) > 1.0:
-        g *= 0.1
-        num_digits += 1
-    format_str = '%.{0}g'.format(num_digits)
-    return format_str % f
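To make the decision table in GetEditType above concrete, here is a self-contained sketch of the same logic with a couple of worked cases; '<unk>' as the OOV word and the tiny vocabulary are illustrative assumptions, not values from the script:

    def edit_type(hyp, ref, dur, oov='<unk>', vocab=frozenset()):
        if hyp == ref != '<eps>':
            return 'cor'
        if hyp != '<eps>' and ref == '<eps>':
            return 'ins'
        if hyp == '<eps>' and ref != '<eps>' and dur == 0.0:
            return 'del'
        if hyp == oov and vocab and ref not in vocab:
            return 'cor'  # OOV ref word substituted by the OOV symbol counts as correct
        if hyp == '<eps>' == ref and dur > 0.0:
            return 'sil'  # optional silence; consumes no reference word
        return 'sub'

    assert edit_type('hello', 'hello', 0.30) == 'cor'
    assert edit_type('<eps>', 'hello', 0.00) == 'del'
    assert edit_type('<unk>', 'zyzzyva', 0.30, vocab={'hello'}) == 'cor'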
-
-def OutputCtm(utterance_id, edits_array, ctm_array):
-    global ctm_edits_out
-    # note: this function expects the padded entries created by PadArrays.
-    assert len(edits_array) == len(ctm_array)
-    channel = '1'  # this is hardcoded at both input and output, since this CTM
-                   # doesn't really represent recordings, only utterances.
-    for i in range(len(edits_array)):
-        ( hyp_word, ref_word ) = edits_array[i]
-        ( start_time, duration, hyp_word2, confidence ) = ctm_array[i]
-        if not hyp_word == hyp_word2:
-            print("Error producing output CTM for edit = {0} and ctm = {1}".format(
-                    edits_array[i], ctm_array[i]), file = sys.stderr)
-            sys.exit(1)
-        assert hyp_word == hyp_word2
-        edit_type = GetEditType(hyp_word, ref_word, duration)
-        print(utterance_id, channel, FloatToString(start_time),
-              FloatToString(duration), hyp_word, confidence, ref_word,
-              edit_type, file = ctm_edits_out)
-
-
-def ProcessOneUtterance(utterance_id, edits_line, ctm_lines):
-    try:
-        # Remove the utterance-id from the beginning of the edits line
-        edits_fields = edits_line[len(utterance_id) + 1:]
-
-        # e.g. if edits_fields is now 'i i ; see be ; my my ', edits_array will become
-        # [ ['i', 'i'], ['see', 'be'], ['my', 'my'] ]
-        edits_array = [ x.split() for x in edits_fields.split(";") ]
-        # The lines below are a fix for when we get empty transcripts and reference, hence
-        # just whitespace in 'edits_fields'.
-        if edits_array == [[]]:
-            edits_array = []
-        try:
-            for x in edits_array:
-                assert len(x) == 2
-        except:
-            sys.exit("get_ctm_edits.py: could not make sense of edits line: " + edits_line)
-
-        # ctm_array will now become something like [ ['1', '1.010', '0.240', 'little' ], ... ]
-        ctm_array = [ x.split() for x in ctm_lines ]
-        ctm_array = []
-        for line in ctm_lines:
-            try:
-                # Strip off the utterance-id and split the remaining fields,
-                # which should be: channel==1, start, dur, word, [confidence]
-                a = line[len(utterance_id) + 1:].split()
-                if len(a) == 4:
-                    a.append(1.0)  # confidence defaults to 1.0.
-                [ channel, start, dur, word, confidence ] = a
-                if channel != '1':
-                    raise Exception("Channel should be 1, got: " + channel)
-                ctm_array.append([ float(start), float(dur), word, float(confidence) ])
-            except Exception as e:
-                sys.exit("get_ctm_edits.py: error processing ctm line {0} "
-                         "... exception is: {1} {2}".format(line, type(e), str(e)))
-        # ctm_array will now be something like [ [ 1.010, 0.240, 'little', 1.0 ], ... ]
-
-        # The following call pads the edits and ctm arrays with appropriate
-        # entries so that they have the same length and the elements 'match up'.
-        (edits_array, ctm_array) = PadArrays(edits_array, ctm_array)
-    except Exception as e:
-        sys.exit("get_ctm_edits.py: error processing utterance {0}, error was: {1}".format(
-                utterance_id, str(e)))
-    OutputCtm(utterance_id, edits_array, ctm_array)
-
-def ProcessData():
-    num_utterances_processed = 0
-
-    pending_ctm_line = ctm_in.readline()
-
-    while True:
-        this_edits_line = edits_in.readline()
-        if this_edits_line == '':
-            if pending_ctm_line != '':
-                sys.exit("get_ctm_edits.py: edits_in input {0} ended before "
-                         "ctm input was ended.  We processed {1} "
-                         "utterances.".format(args.edits_in, num_utterances_processed))
We processed {1} " - "utterances.".format(args.edits_in, num_utterances_processed)) - break - a = this_edits_line.split() - if len(a) == 0: - sys.exit("get_ctm_edits.py: edits_input {0} had an empty line".format( - args.edits_in)) - utterance_id = a[0] - utterance_id_len = len(utterance_id) - this_utterance_ctm_lines = [] - while pending_ctm_line[0:utterance_id_len] == utterance_id: - this_utterance_ctm_lines.append(pending_ctm_line) - pending_ctm_line = ctm_in.readline() - ProcessOneUtterance(utterance_id, this_edits_line, - this_utterance_ctm_lines) - num_utterances_processed += 1 - print("get_ctm_edits.py: processed {0} utterances".format( - num_utterances_processed), file=sys.stderr) - - -OpenFiles() -ProcessData() - diff --git a/egs/wsj/s5/steps/cleanup/get_non_scored_words.py b/egs/wsj/s5/steps/cleanup/get_non_scored_words.py deleted file mode 100755 index 2ffdf3b7f94..00000000000 --- a/egs/wsj/s5/steps/cleanup/get_non_scored_words.py +++ /dev/null @@ -1,90 +0,0 @@ -#!/usr/bin/env python - -# Copyright 2016 Vimal Manohar -# 2016 Johns Hopkins University (author: Daniel Povey) -# Apache 2.0 - -from __future__ import print_function -import sys, operator, argparse, os -from collections import defaultdict - -# If you supply the directory (the one that corresponds to -# how you decoded the data) to this script, it assumes that the -# directory contains phones/align_lexicon.int, and it uses this to work -# out a reasonable guess of the non-scored phones, based on which have -# a single-word pronunciation that maps to a silence phone. -# It then uses the words.txt to work out the written form of those words. - -parser = argparse.ArgumentParser( - description = "This program works out a reasonable guess at a list of " - "non-scored words (words that won't affect the WER evaluation): " - "things like [COUGH], [NOISE] and so on. This is useful because a list of " - "such words is required by some other scripts (e.g. modify_ctm_edits.py), " - "and it's inconvenient to have to specify the list manually for each language. " - "This program writes out the words in text form, one per line.") - -parser.add_argument("lang", type = str, - help = "The lang/ directory. This program expects " - "lang/words.txt and lang/phones/silence.int and " - "lang/phones/align_lexicon.int to exist, and will use them to work " - "out a reasonable guess of the non-scored words (as those whose " - "pronunciations are a single phone in the 'silphones' list)") - -args = parser.parse_args() - -non_scored_words = set() - - -def ReadLang(lang_dir): - global non_scored_words - - if not os.path.isdir(lang_dir): - sys.exit("modify_ctm_edits.py expected lang/ directory {0} to " - "exist.".format(lang_dir)) - for f in [ '/words.txt', '/phones/silence.int', '/phones/align_lexicon.int' ]: - if not os.path.exists(lang_dir + f): - sys.exit("modify_ctm_edits.py: expected file {0}{1} to exist.".format( - lang_dir, f)) - # read silence-phones. - try: - silence_phones = set() - for line in open(lang_dir + '/phones/silence.int').readlines(): - silence_phones.add(int(line)) - except Exception as e: - sys.exit("modify_ctm_edits.py: problem reading file " - "{0}/phones/silence.int: {1}".format(lang_dir, str(e))) - - # read align_lexicon.int. - # format is: .. 
-    # We're looking for lines of the form:
-    # w w p
-    # where w > 0 and p is in the set 'silence_phones'
-    try:
-        silence_word_ints = set()
-        for line in open(lang_dir + '/phones/align_lexicon.int').readlines():
-            a = line.split()
-            if len(a) == 3 and a[0] == a[1] and int(a[0]) > 0 and \
-               int(a[2]) in silence_phones:
-                silence_word_ints.add(int(a[0]))
-    except Exception as e:
-        sys.exit("modify_ctm_edits.py: problem reading file "
-                 "{0}/phones/align_lexicon.int: "
-                 "{1}".format(lang_dir, str(e)))
-
-    try:
-        for line in open(lang_dir + '/words.txt').readlines():
-            [ word, integer ] = line.split()
-            if int(integer) in silence_word_ints:
-                non_scored_words.add(word)
-    except Exception as e:
-        sys.exit("modify_ctm_edits.py: problem reading file "
-                 "{0}/words.txt.int: {1}".format(lang_dir, str(e)))
-
-    if not len(non_scored_words) == len(silence_word_ints):
-        sys.exit("modify_ctm_edits.py: error getting silence words, len({0}) != len({1})",
-                 str(non_scored_words), str(silence_word_ints))
-    for word in non_scored_words:
-        print(word)
-
-
-ReadLang(args.lang)
diff --git a/egs/wsj/s5/steps/cleanup/internal/get_ctm_edits.py b/egs/wsj/s5/steps/cleanup/internal/get_ctm_edits.py
index 385b0c5c5dd..d0f762d1197 100755
--- a/egs/wsj/s5/steps/cleanup/internal/get_ctm_edits.py
+++ b/egs/wsj/s5/steps/cleanup/internal/get_ctm_edits.py
@@ -275,11 +275,20 @@ def OutputCtm(utterance_id, edits_array, ctm_array):
 def ProcessOneUtterance(utterance_id, edits_line, ctm_lines):
     try:
         # Remove the utterance-id from the beginning of the edits line
-        edits_line = edits_line[len(utterance_id) + 1:]
+        edits_fields = edits_line[len(utterance_id) + 1:]
 
-        # e.g. if edits_line is now 'i i ; see be ; my my ', edits_array will become
+        # e.g. if edits_fields is now 'i i ; see be ; my my ', edits_array will become
         # [ ['i', 'i'], ['see', 'be'], ['my', 'my'] ]
-        edits_array = [ x.split() for x in edits_line.split(";") ]
+        fields_split = edits_fields.split()
+        first_fields, second_fields = fields_split[0::3], fields_split[1::3]
+        if (
+            len(first_fields) != len(second_fields) or
+            (len(fields_split) >= 3 and set(fields_split[2::3]) != {';'})
+        ):
+            sys.exit("get_ctm_edits.py: could not make sense of edits line: " + edits_line)
+
+        edits_array = list(zip(first_fields, second_fields))
+
         # ctm_array will now become something like [ ['1', '1.010', '0.240', 'little' ], ... ]
         ctm_array = [ x.split() for x in ctm_lines ]
         ctm_array = []
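The stride-3 slicing in the new code above treats the alignment line as a flat sequence of 'hyp ref ;' triples; a small standalone illustration with made-up data:

    fields = 'i i ; see be ; my my'.split()
    hyps, refs, seps = fields[0::3], fields[1::3], fields[2::3]
    assert len(hyps) == len(refs) and set(seps) == {';'}
    print(list(zip(hyps, refs)))  # [('i', 'i'), ('see', 'be'), ('my', 'my')]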
diff --git a/egs/wsj/s5/steps/cleanup/internal/get_non_scored_words.py b/egs/wsj/s5/steps/cleanup/internal/get_non_scored_words.py
index 2ffdf3b7f94..84d1ca0fbf6 100755
--- a/egs/wsj/s5/steps/cleanup/internal/get_non_scored_words.py
+++ b/egs/wsj/s5/steps/cleanup/internal/get_non_scored_words.py
@@ -39,11 +39,11 @@ def ReadLang(lang_dir):
     global non_scored_words
 
     if not os.path.isdir(lang_dir):
-        sys.exit("modify_ctm_edits.py expected lang/ directory {0} to "
+        sys.exit("get_non_scored_words.py expected lang/ directory {0} to "
                  "exist.".format(lang_dir))
     for f in [ '/words.txt', '/phones/silence.int', '/phones/align_lexicon.int' ]:
         if not os.path.exists(lang_dir + f):
-            sys.exit("modify_ctm_edits.py: expected file {0}{1} to exist.".format(
+            sys.exit("get_non_scored_words.py: expected file {0}{1} to exist.".format(
                 lang_dir, f))
     # read silence-phones.
     try:
@@ -51,7 +51,7 @@ def ReadLang(lang_dir):
         for line in open(lang_dir + '/phones/silence.int').readlines():
             silence_phones.add(int(line))
     except Exception as e:
-        sys.exit("modify_ctm_edits.py: problem reading file "
+        sys.exit("get_non_scored_words.py: problem reading file "
                  "{0}/phones/silence.int: {1}".format(lang_dir, str(e)))
 
     # read align_lexicon.int.
@@ -67,7 +67,7 @@ def ReadLang(lang_dir):
             int(a[2]) in silence_phones:
             silence_word_ints.add(int(a[0]))
     except Exception as e:
-        sys.exit("modify_ctm_edits.py: problem reading file "
+        sys.exit("get_non_scored_words.py: problem reading file "
                  "{0}/phones/align_lexicon.int: "
                  "{1}".format(lang_dir, str(e)))
 
@@ -77,11 +77,11 @@ def ReadLang(lang_dir):
         if int(integer) in silence_word_ints:
             non_scored_words.add(word)
     except Exception as e:
-        sys.exit("modify_ctm_edits.py: problem reading file "
+        sys.exit("get_non_scored_words.py: problem reading file "
                  "{0}/words.txt.int: {1}".format(lang_dir, str(e)))
 
     if not len(non_scored_words) == len(silence_word_ints):
-        sys.exit("modify_ctm_edits.py: error getting silence words, len({0}) != len({1})",
+        sys.exit("get_non_scored_words.py: error getting silence words, len({0}) != len({1})",
                  str(non_scored_words), str(silence_word_ints))
     for word in non_scored_words:
         print(word)
diff --git a/egs/wsj/s5/steps/cleanup/internal/modify_ctm_edits.py b/egs/wsj/s5/steps/cleanup/internal/modify_ctm_edits.py
index 1022196a456..ea56219fe2a 100755
--- a/egs/wsj/s5/steps/cleanup/internal/modify_ctm_edits.py
+++ b/egs/wsj/s5/steps/cleanup/internal/modify_ctm_edits.py
@@ -52,7 +52,7 @@
 parser = argparse.ArgumentParser(
     description = "This program modifies the reference in the ctm-edits which "
-    "is output by steps/cleanup/get_ctm_edits.py, to allow insertions, deletions and "
+    "is output by steps/cleanup/internal/get_ctm_edits.py, to allow insertions, deletions and "
     "substitutions of non-scored words, and [if --allow-repetitions=true], "
     "duplications of single words or pairs of scored words (to account for dysfluencies "
     "that were not transcribed). Note: deletions and substitutions of non-scored words "
diff --git a/egs/wsj/s5/steps/cleanup/internal/segment_ctm_edits.py b/egs/wsj/s5/steps/cleanup/internal/segment_ctm_edits.py
index 7901aca0320..57e9d6ab959 100755
--- a/egs/wsj/s5/steps/cleanup/internal/segment_ctm_edits.py
+++ b/egs/wsj/s5/steps/cleanup/internal/segment_ctm_edits.py
@@ -912,23 +912,23 @@ def ProcessData():
     try:
         f_in = open(args.ctm_edits_in)
     except:
-        sys.exit("modify_ctm_edits.py: error opening ctm-edits input "
+        sys.exit("segment_ctm_edits.py: error opening ctm-edits input "
                  "file {0}".format(args.ctm_edits_in))
     try:
         text_output_handle = open(args.text_out, 'w')
     except:
-        sys.exit("modify_ctm_edits.py: error opening text output "
+        sys.exit("segment_ctm_edits.py: error opening text output "
                  "file {0}".format(args.text_out))
     try:
        segments_output_handle = open(args.segments_out, 'w')
     except:
-        sys.exit("modify_ctm_edits.py: error opening segments output "
+        sys.exit("segment_ctm_edits.py: error opening segments output "
                  "file {0}".format(args.text_out))
     if args.ctm_edits_out != None:
         try:
             ctm_edits_output_handle = open(args.ctm_edits_out, 'w')
         except:
-            sys.exit("modify_ctm_edits.py: error opening ctm-edits output "
+            sys.exit("segment_ctm_edits.py: error opening ctm-edits output "
                      "file {0}".format(args.ctm_edits_out))
 
     # Most of what we're doing in the lines below is splitting the input lines
@@ -936,10 +936,10 @@
     # and then printing the modified lines.
     first_line = f_in.readline()
     if first_line == '':
-        sys.exit("modify_ctm_edits.py: empty input")
+        sys.exit("segment_ctm_edits.py: empty input")
     split_pending_line = first_line.split()
     if len(split_pending_line) == 0:
-        sys.exit("modify_ctm_edits.py: bad input line " + first_line)
+        sys.exit("segment_ctm_edits.py: bad input line " + first_line)
     cur_utterance = split_pending_line[0]
     split_lines_of_cur_utterance = []
 
@@ -966,14 +966,14 @@ def ProcessData():
             split_pending_line = next_line.split()
             if len(split_pending_line) == 0:
                 if next_line != '':
-                    sys.exit("modify_ctm_edits.py: got an empty or whitespace input line")
+                    sys.exit("segment_ctm_edits.py: got an empty or whitespace input line")
     try:
         text_output_handle.close()
         segments_output_handle.close()
         if args.ctm_edits_out != None:
             ctm_edits_output_handle.close()
     except:
-        sys.exit("modify_ctm_edits.py: error closing one or more outputs "
+        sys.exit("segment_ctm_edits.py: error closing one or more outputs "
                  "(broken pipe or full disk?)")
 
@@ -982,12 +982,12 @@ def ReadNonScoredWords(non_scored_words_file):
     try:
         f = open(non_scored_words_file)
     except:
-        sys.exit("modify_ctm_edits.py: error opening file: "
+        sys.exit("segment_ctm_edits.py: error opening file: "
                  "--non-scored-words=" + non_scored_words_file)
     for line in f.readlines():
         a = line.split()
         if not len(line.split()) == 1:
-            sys.exit("modify_ctm_edits.py: bad line in non-scored-words "
+            sys.exit("segment_ctm_edits.py: bad line in non-scored-words "
                      "file {0}: {1}".format(non_scored_words_file, line))
         non_scored_words.add(a[0])
     f.close()
diff --git a/egs/wsj/s5/steps/cleanup/internal/taint_ctm_edits.py b/egs/wsj/s5/steps/cleanup/internal/taint_ctm_edits.py
index c763d7191a1..2230a10aee2 100755
--- a/egs/wsj/s5/steps/cleanup/internal/taint_ctm_edits.py
+++ b/egs/wsj/s5/steps/cleanup/internal/taint_ctm_edits.py
@@ -132,12 +132,12 @@ def ProcessData():
     try:
         f_in = open(args.ctm_edits_in)
     except:
-        sys.exit("modify_ctm_edits.py: error opening ctm-edits input "
+        sys.exit("taint_ctm_edits.py: error opening ctm-edits input "
                  "file {0}".format(args.ctm_edits_in))
     try:
         f_out = open(args.ctm_edits_out, 'w')
     except:
-        sys.exit("modify_ctm_edits.py: error opening ctm-edits output "
+        sys.exit("taint_ctm_edits.py: error opening ctm-edits output "
                  "file {0}".format(args.ctm_edits_out))
 
     num_lines_processed = 0
@@ -147,10 +147,10 @@
     # and then printing the modified lines.
     first_line = f_in.readline()
     if first_line == '':
-        sys.exit("modify_ctm_edits.py: empty input")
+        sys.exit("taint_ctm_edits.py: empty input")
     split_pending_line = first_line.split()
     if len(split_pending_line) == 0:
-        sys.exit("modify_ctm_edits.py: bad input line " + first_line)
+        sys.exit("taint_ctm_edits.py: bad input line " + first_line)
     cur_utterance = split_pending_line[0]
     split_lines_of_cur_utterance = []
 
@@ -170,7 +170,7 @@ def ProcessData():
             split_pending_line = next_line.split()
             if len(split_pending_line) == 0:
                 if next_line != '':
-                    sys.exit("modify_ctm_edits.py: got an empty or whitespace input line")
+                    sys.exit("taint_ctm_edits.py: got an empty or whitespace input line")
     try:
         f_out.close()
     except:
@@ -181,13 +181,13 @@ def PrintNonScoredStats():
     if args.verbose < 1:
         return
     if num_lines == 0:
-        print("modify_ctm_edits.py: processed no input.", file = sys.stderr)
+        print("taint_ctm_edits.py: processed no input.", file = sys.stderr)
     num_lines_modified = sum(ref_change_stats.values())
     num_incorrect_lines = num_lines - num_correct_lines
     percent_lines_incorrect= '%.2f' % (num_incorrect_lines * 100.0 / num_lines)
     percent_modified = '%.2f' % (num_lines_modified * 100.0 / num_lines);
     percent_of_incorrect_modified = '%.2f' % (num_lines_modified * 100.0 / num_incorrect_lines)
-    print("modify_ctm_edits.py: processed {0} lines of ctm ({1}% of which incorrect), "
+    print("taint_ctm_edits.py: processed {0} lines of ctm ({1}% of which incorrect), "
          "of which {2} were changed fixing the reference for non-scored words "
          "({3}% of lines, or {4}% of incorrect lines)".format(
              num_lines, percent_lines_incorrect, num_lines_modified,
@@ -198,7 +198,7 @@ def PrintNonScoredStats():
                   key = lambda x: ref_change_stats[x])
     num_keys_to_print = 40 if args.verbose >= 2 else 10
 
-    print("modify_ctm_edits.py: most common edits (as percentages "
+    print("taint_ctm_edits.py: most common edits (as percentages "
          "of all such edits) are:\n" +
          ('\n'.join([ '%s [%.2f%%]' % (k, ref_change_stats[k]*100.0/num_lines_modified)
                       for k in keys[0:num_keys_to_print]]))
diff --git a/egs/wsj/s5/steps/cleanup/lattice_oracle_align.sh b/egs/wsj/s5/steps/cleanup/lattice_oracle_align.sh
index 80b4739a629..dc8cd7d3deb 100755
--- a/egs/wsj/s5/steps/cleanup/lattice_oracle_align.sh
+++ b/egs/wsj/s5/steps/cleanup/lattice_oracle_align.sh
@@ -192,7 +192,7 @@ if [ $stage -le 5 ]; then
   $cmd $dir/log/get_ctm_edits.log \
     align-text ark:$dir/oracle_hyp.txt ark:$dir/text ark,t:- \| \
-    steps/cleanup/get_ctm_edits.py --oov=$oov --symbol-table=$lang/words.txt \
+    steps/cleanup/internal/get_ctm_edits.py --oov=$oov --symbol-table=$lang/words.txt \
       /dev/stdin $dir/ctm $dir/ctm_edits || exit 1
 
   echo "$0: ctm with edits information appended is in $dir/ctm_edits"
diff --git a/egs/wsj/s5/steps/cleanup/make_one_biased_lm.py b/egs/wsj/s5/steps/cleanup/make_one_biased_lm.py
deleted file mode 100755
index 2e0bc1fcda1..00000000000
--- a/egs/wsj/s5/steps/cleanup/make_one_biased_lm.py
+++ /dev/null
@@ -1,310 +0,0 @@
-#!/usr/bin/env python
-
-from __future__ import print_function
-import sys
-import argparse
-import math
-from collections import defaultdict
-
-parser = argparse.ArgumentParser(description="""
-This script creates a biased language model suitable for alignment and
-data-cleanup purposes.  It reads (possibly multiple) lines of integerized text
-from the input and writes a text-form FST of a backoff language model to
-the standard output, to be piped into fstcompile.""")
-
-parser.add_argument("--word-disambig-symbol", type = int, required = True,
-                    help = "Integer corresponding to the disambiguation "
-                    "symbol (normally #0) for backoff arcs")
-parser.add_argument("--ngram-order", type = int, default = 4,
-                    choices = [2,3,4,5,6,7],
-                    help = "Maximum order of n-gram to use (but see also "
-                    "--min-lm-state-count; the effective order may be less).")
-parser.add_argument("--min-lm-state-count", type = int, default = 10,
-                    help = "Minimum count below which we will completely "
-                    "discount an LM-state (if it is of order > 2, i.e. "
-                    "history-length > 1).")
-parser.add_argument("--top-words", type = str,
-                    help = "File containing frequent words and probabilities to be added into "
-                    "the language model, with lines in the format '<word-index> <probability>'. "
-                    "These probabilities will be added to the probabilities in the unigram "
-                    "backoff state and then renormalized; this option allows you to introduce "
-                    "common words to the LM with specified probabilities.")
-parser.add_argument("--discounting-constant", type = float, default = 0.3,
-                    help = "Discounting constant D for standard (unmodified) Kneser-Ney; "
-                    "must be strictly between 0 and 1.  A value closer to 0 will give "
-                    "you a more-strongly-biased LM.")
-parser.add_argument("--verbose", type = int, default = 0,
-                    choices=[0,1,2,3,4,5], help = "Verbose level")
-
-args = parser.parse_args()
-
-if args.verbose >= 1:
-    print(' '.join(sys.argv), file = sys.stderr)
-
-
-class NgramCounts:
-    ## A note on the data structure.
-    ## Firstly, all words are represented as integers.
-    ## We store n-gram counts as an array, indexed by (history-length == n-gram order minus one)
-    ## (note: python calls arrays "lists") of dicts from histories to counts, where
-    ## histories are tuples of integers and "counts" are dicts from integer to float.
-    ## For instance, when accumulating the 4-gram count for the '8' in the sequence '5 6 7 8',
-    ## we'd do as follows:
-    ##   self.counts[3][(5,6,7)][8] += 1.0
-    ## where the [3] indexes an array, the [(5,6,7)] indexes a dict, and
-    ## the [8] indexes a dict.
-    def __init__(self, ngram_order):
-        self.ngram_order = ngram_order
-        # Integerized counts will never contain negative numbers, so
-        # inside this program, we use -3 and -2 for the BOS and EOS symbols
-        # respectively.
-        # Note: it's actually important that the bos-symbol is the most negative;
-        # it helps ensure that we print the state with <s> as the left context first
-        # when we print the FST, and this means that the start-state will have
-        # the correct value.
-        self.bos_symbol = -3
-        self.eos_symbol = -2
-        # backoff_symbol is kind of a pseudo-word; it's used in keeping track of
-        # the backoff counts in each state.
-        self.backoff_symbol = -1
-        self.counts = []
-        for n in range(ngram_order):
-            # The 'lambda: defaultdict(float)' is an anonymous function taking
-            # no arguments that returns a new defaultdict(float).
-            # If we index self.counts[n][history] for a history-length n < ngram_order
-            # and a previously unseen history, it will create a new defaultdict
-            # that defaults to 0.0 [since the function float() will return 0.0].
-            # This means that we can index self.counts without worrying about
-            # undefined values.
-            self.counts.append(defaultdict(lambda: defaultdict(float)))
-
-    # adds a raw count (called while processing input data).
-    # Suppose we see the sequence '6 7 8 9' and ngram_order=4, 'history'
-    # would be (6,7,8) and 'predicted_word' would be 9; 'count' would be
-    # 1.0.
-    def AddCount(self, history, predicted_word, count):
-        self.counts[len(history)][history][predicted_word] += count
-
-    # 'line' is a string containing a sequence of integer word-ids.
-    # This function adds the un-smoothed counts from this line of text.
-    def AddRawCountsFromLine(self, line):
-        try:
-            words = [self.bos_symbol] + [ int(x) for x in line.split() ] + [self.eos_symbol]
-        except:
-            sys.exit("make_one_biased_lm.py: bad input line {0} (expected a sequence "
-                     "of integers)".format(line))
-
-        for n in range(1, len(words)):
-            predicted_word = words[n]
-            history_start = max(0, n + 1 - self.ngram_order)
-            history = tuple(words[history_start:n])
-            self.AddCount(history, predicted_word, 1.0)
-
-    def AddRawCountsFromStandardInput(self):
-        lines_processed = 0
-        while True:
-            line = sys.stdin.readline()
-            if line == '':
-                break
-            self.AddRawCountsFromLine(line)
-            lines_processed += 1
-        if lines_processed == 0 or args.verbose > 0:
-            print("make_one_biased_lm.py: processed {0} lines of input".format(
-                    lines_processed), file = sys.stderr)
-
-
-    # This function returns a dict from history (as a tuple of integers of
-    # length > 1, ignoring lower-order histories), to the total count of this
-    # history state plus all history-states which back off to this history state.
-    # It's used inside CompletelyDiscountLowCountStates().
-    def GetHistToTotalCount(self):
-        ans = defaultdict(float)
-        for n in range(2, self.ngram_order):
-            for hist, word_to_count in self.counts[n].items():
-                total_count = sum(word_to_count.values())
-                while len(hist) >= 2:
-                    ans[hist] += total_count
-                    hist = hist[1:]
-        return ans
-
-
-    # This function will completely discount the counts in any LM-states of
-    # order > 2 (i.e. history-length > 1) that have total count below
-    # 'min_count'; when computing the total counts, we include higher-order
-    # LM-states that would back off to 'this' lm-state, in the total.
-    def CompletelyDiscountLowCountStates(self, min_count):
-        hist_to_total_count = self.GetHistToTotalCount()
-        for n in reversed(range(2, self.ngram_order)):
-            this_order_counts = self.counts[n]
-            for hist in this_order_counts.keys():
-                if hist_to_total_count[hist] < min_count:
-                    # we need to completely back off this count.
-                    word_to_count = this_order_counts[hist]
-                    del this_order_counts[hist]  # delete the key from the dict.
-                    backoff_hist = hist[1:]  # this will be a tuple not a list.
-                    for word, count in word_to_count.items():
-                        self.AddCount(backoff_hist, word, count)
-
-
-    # This backs off the counts according to Kneser-Ney (unmodified,
-    # with interpolation).
-    def ApplyBackoff(self, D):
-        assert D > 0.0 and D < 1.0
-        for n in reversed(range(1, self.ngram_order)):
-            this_order_counts = self.counts[n]
-            for hist, word_to_count in this_order_counts.items():
-                backoff_hist = hist[1:]
-                backoff_word_to_count = self.counts[n-1][backoff_hist]
-                this_discount_total = 0.0
-                for word in word_to_count:
-                    assert word_to_count[word] >= 1.0
-                    word_to_count[word] -= D
-                    this_discount_total += D
-                    # Interpret the following line as incrementing the
-                    # count-of-counts for the next-lower order.
-                    backoff_word_to_count[word] += 1.0
-                word_to_count[self.backoff_symbol] += this_discount_total
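A tiny standalone demonstration of the nested-defaultdict count structure described in the class comment above (values invented for illustration):

    from collections import defaultdict

    ngram_order = 4
    # one dict per history length 0..3; each maps a history tuple to {word: count}
    counts = [defaultdict(lambda: defaultdict(float)) for _ in range(ngram_order)]
    counts[3][(5, 6, 7)][8] += 1.0      # the 4-gram count for 8 after '5 6 7'
    assert counts[2][(6, 7)][8] == 0.0  # unseen entries default to 0.0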
-
-    # This function prints out to stderr the n-gram counts stored in this
-    # object; it's used for debugging.
-    def Print(self, info_string):
-        print(info_string, file=sys.stderr)
-        # these are useful for debug.
-        total = 0.0
-        total_excluding_backoff = 0.0
-        for this_order_counts in self.counts:
-            for hist, word_to_count in this_order_counts.items():
-                this_total_count = sum(word_to_count.values())
-                print(str(hist) + ': total={0} '.format(this_total_count),
-                      end='', file=sys.stderr)
-                print(' '.join(['{0} -> {1} '.format(word, count)
-                                for word, count in word_to_count.items() ]),
-                      file = sys.stderr)
-                total += this_total_count
-                total_excluding_backoff += this_total_count
-                if self.backoff_symbol in word_to_count:
-                    total_excluding_backoff -= word_to_count[self.backoff_symbol]
-        print('total count = {0}, excluding discount = {1}'.format(
-                total, total_excluding_backoff), file = sys.stderr)
-
-    def AddTopWords(self, top_words_file):
-        empty_history = ()
-        word_to_count = self.counts[0][empty_history]
-        total = sum(word_to_count.values())
-        try:
-            f = open(top_words_file)
-        except:
-            sys.exit("make_one_biased_lm.py: error opening top-words file: "
-                     "--top-words=" + top_words_file)
-        while True:
-            line = f.readline()
-            if line == '':
-                break
-            try:
-                [ word_index, prob ] = line.split()
-                word_index = int(word_index)
-                prob = float(prob)
-                assert word_index > 0 and prob > 0.0
-                word_to_count[word_index] += prob * total
-            except Exception as e:
-                sys.exit("make_one_biased_lm.py: could not make sense of the "
-                         "line '{0}' in top-words file: {1} ".format(line, str(e)))
-        f.close()
-
-
-    def GetTotalCountMap(self):
-        # This function, called from PrintAsFst, returns a map from
-        # history to the total-count for that state.
-        total_count_map = dict()
-        for n in range(0, self.ngram_order):
-            for hist, word_to_count in self.counts[n].items():
-                total_count_map[hist] = sum(word_to_count.values())
-        return total_count_map
-
-    def GetHistToStateMap(self):
-        # This function, called from PrintAsFst, returns a map from
-        # history to integer FST-state.
-        hist_to_state = dict()
-        fst_state_counter = 0
-        for n in range(0, self.ngram_order):
-            for hist in self.counts[n].keys():
-                hist_to_state[hist] = fst_state_counter
-                fst_state_counter += 1
-        return hist_to_state
-
-    def GetProb(self, hist, word, total_count_map):
-        total_count = total_count_map[hist]
-        word_to_count = self.counts[len(hist)][hist]
-        prob = word_to_count[word] / total_count
-        if len(hist) > 0 and word != self.backoff_symbol:
-            prob_in_backoff = self.GetProb(hist[1:], word, total_count_map)
-            backoff_prob = word_to_count[self.backoff_symbol] / total_count
-            prob += backoff_prob * prob_in_backoff
-        return prob
-
-    # This function prints the estimated language model as an FST.
-    def PrintAsFst(self, word_disambig_symbol):
-        # n is the history-length (== n-gram order minus one).  We iterate over the
-        # history-length in the order 1, 0, 2, 3, and then iterate over the
-        # histories of each order in sorted order.  Putting order 1 first
-        # and sorting on the histories
-        # ensures that the bigram state with <s> as the left context comes first.
-        # (note: self.bos_symbol is the most negative symbol)
-
-        # hist_to_state will map from history (as a tuple) to integer FST-state.
-        hist_to_state = self.GetHistToStateMap()
-        total_count_map = self.GetTotalCountMap()
-
-        for n in [ 1, 0 ] + range(2, self.ngram_order):
-            this_order_counts = self.counts[n]
-            # For order 1, make sure the keys are sorted.
-            keys = this_order_counts.keys() if n != 1 else sorted(this_order_counts.keys())
-            for hist in keys:
-                word_to_count = this_order_counts[hist]
-                this_fst_state = hist_to_state[hist]
-
-                for word in word_to_count.keys():
-                    # work out this_cost.  Costs in OpenFst are negative logs.
-                    this_cost = -math.log(self.GetProb(hist, word, total_count_map))
-
-                    if word > 0:  # a real word.
-                        next_hist = hist + (word,)  # appending tuples
-                        while not next_hist in hist_to_state:
-                            next_hist = next_hist[1:]
-                        next_fst_state = hist_to_state[next_hist]
-                        print(this_fst_state, next_fst_state, word, word,
-                              this_cost)
-                    elif word == self.eos_symbol:
-                        # print final-prob for this state.
-                        print(this_fst_state, this_cost)
-                    else:
-                        assert word == self.backoff_symbol
-                        backoff_fst_state = hist_to_state[hist[1:len(hist)]]
-                        print(this_fst_state, backoff_fst_state,
-                              word_disambig_symbol, 0, this_cost)
-
-
-ngram_counts = NgramCounts(args.ngram_order)
-ngram_counts.AddRawCountsFromStandardInput()
-
-if args.verbose >= 3:
-    ngram_counts.Print("Raw counts:")
-ngram_counts.CompletelyDiscountLowCountStates(args.min_lm_state_count)
-if args.verbose >= 3:
-    ngram_counts.Print("Counts after discounting low-count states:")
-ngram_counts.ApplyBackoff(args.discounting_constant)
-if args.verbose >= 3:
-    ngram_counts.Print("Counts after applying Kneser-Ney discounting:")
-if args.top_words != None:
-    ngram_counts.AddTopWords(args.top_words)
-    if args.verbose >= 3:
-        ngram_counts.Print("Counts after applying top-n-words")
-ngram_counts.PrintAsFst(args.word_disambig_symbol)
-
-
-# test command:
-# (echo 6 7 8 4; echo 7 8 9; echo 7 8) | ./make_one_biased_lm.py --word-disambig-symbol=1000 --min-lm-state-count=2 --verbose=3 --top-words=<(echo 1 0.5; echo 2 0.25)
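For orientation before the next deleted script: the probability that GetProb assigns after the Kneser-Ney backoff above interpolates the discounted direct count with the backoff state's estimate. A standalone sketch with invented numbers (it mirrors GetProb rather than importing the script):

    def interp_prob(count, total, backoff_mass, backoff_prob):
        # p(word | hist) = count/total + (backoff_mass/total) * p(word | shorter hist)
        return count / total + (backoff_mass / total) * backoff_prob

    # e.g. count(word)=0.7 after discounting D=0.3, state total 2.0,
    # accumulated backoff mass 0.3, backoff-state probability 0.25:
    assert abs(interp_prob(0.7, 2.0, 0.3, 0.25) - 0.3875) < 1e-12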
diff --git a/egs/wsj/s5/steps/cleanup/modify_ctm_edits.py b/egs/wsj/s5/steps/cleanup/modify_ctm_edits.py
deleted file mode 100755
index 1022196a456..00000000000
--- a/egs/wsj/s5/steps/cleanup/modify_ctm_edits.py
+++ /dev/null
@@ -1,428 +0,0 @@
-#!/usr/bin/env python
-
-# Copyright 2016  Vimal Manohar
-#           2016  Johns Hopkins University (author: Daniel Povey)
-# Apache 2.0
-
-from __future__ import print_function
-import sys, operator, argparse, os
-from collections import defaultdict
-
-# This script reads and writes the 'ctm-edits' file that is
-# produced by get_ctm_edits.py.
-
-# It modifies the ctm-edits so that non-scored words
-# are not counted as errors: for instance, if there are things like
-# [COUGH] and [NOISE] in the transcript, deletions, insertions and
-# substitutions involving them are allowed, and we modify the reference
-# to correspond to the hypothesis.
-#
-# If you supply the <lang> directory (the one that corresponds to
-# how you decoded the data) to this script, it assumes that the <lang>
-# directory contains phones/align_lexicon.int, and it uses this to work
-# out a reasonable guess of the non-scored phones, based on which have
-# a single-word pronunciation that maps to a silence phone.
-# It then uses the words.txt to work out the written form of those words.
-#
-# Alternatively, you may specify a file containing the non-scored words one
-# per line, with the --non-scored-words option.
-#
-# Non-scored words that were deleted (i.e. they were in the ref but not the
-# hyp) are simply removed from the ctm.  For non-scored words that
-# were inserted or substituted, we change the reference word to match the
-# hyp word, but instead of marking the operation as 'cor' (correct), we
-# mark it as 'fix' (fixed), so that it will not be positively counted as a correct
-# word for purposes of finding the optimal segment boundaries.
-#
-# e.g.
-#
-# [note: the <channel> will always be 1].
-#
-# AJJacobs_2007P-0001605-0003029 1 0 0.09 <eps> 1.0 <eps> sil
-# AJJacobs_2007P-0001605-0003029 1 0.09 0.15 i 1.0 i cor
-# AJJacobs_2007P-0001605-0003029 1 0.24 0.25 thought 1.0 thought cor
-# AJJacobs_2007P-0001605-0003029 1 0.49 0.14 i'd 1.0 i'd cor
-# AJJacobs_2007P-0001605-0003029 1 0.63 0.22 tell 1.0 tell cor
-# AJJacobs_2007P-0001605-0003029 1 0.85 0.11 you 1.0 you cor
-# AJJacobs_2007P-0001605-0003029 1 0.96 0.05 a 1.0 a cor
-# AJJacobs_2007P-0001605-0003029 1 1.01 0.24 little 1.0 little cor
-# AJJacobs_2007P-0001605-0003029 1 1.25 0.5 about 1.0 about cor
-# AJJacobs_2007P-0001605-0003029 1 1.75 0.48 [UH] 1.0 [UH] cor
-
-
-parser = argparse.ArgumentParser(
-    description = "This program modifies the reference in the ctm-edits which "
-    "is output by steps/cleanup/get_ctm_edits.py, to allow insertions, deletions and "
-    "substitutions of non-scored words, and [if --allow-repetitions=true], "
-    "duplications of single words or pairs of scored words (to account for dysfluencies "
-    "that were not transcribed).  Note: deletions and substitutions of non-scored words "
-    "after the reference is corrected, will be marked as operation 'fix' rather than "
-    "'cor' (correct) so that the downstream processing knows that this was not in "
-    "the original reference.  Also by default tags non-scored words as such when "
-    "they are correct; see the --tag-non-scored option.")
-
-parser.add_argument("--verbose", type = int, default = 1,
-                    choices=[0,1,2,3],
-                    help = "Verbose level, higher = more verbose output")
-parser.add_argument("--allow-repetitions", type = str, default = 'true',
-                    choices=['true','false'],
-                    help = "If true, allow repetitions in the transcript of one- or "
-                    "two-word sequences: for instance if the ref says 'i' but "
-                    "the hyp says 'i i', or the ref says 'but then' and the hyp says "
-                    "'but then but then', fix the reference accordingly.  Intervening "
-                    "non-scored words are allowed between the repetitions.  These "
-                    "fixes will be marked as 'cor', not as 'fix', since there is "
-                    "generally no way to tell which repetition was the 'real' one "
-                    "(and since we're generally confident that such things were "
-                    "actually uttered).")
-parser.add_argument("non_scored_words_in", metavar = "<non-scored-words-file>",
-                    help = "Filename of file containing a list of non-scored words, "
-                    "one per line.  See steps/cleanup/get_non_scored_words.py.")
-parser.add_argument("ctm_edits_in", metavar = "<ctm-edits-in>",
-                    help = "Filename of input ctm-edits file. "
-                    "Use /dev/stdin for standard input.")
-parser.add_argument("ctm_edits_out", metavar = "<ctm-edits-out>",
-                    help = "Filename of output ctm-edits file. "
-                    "Use /dev/stdout for standard output.")
-
-args = parser.parse_args()
" - "Use /dev/stdout for standard output.") - -args = parser.parse_args() - - - -def ReadNonScoredWords(non_scored_words_file): - global non_scored_words - try: - f = open(non_scored_words_file) - except: - sys.exit("modify_ctm_edits.py: error opening file: " - "--non-scored-words=" + non_scored_words_file) - for line in f.readlines(): - a = line.split() - if not len(line.split()) == 1: - sys.exit("modify_ctm_edits.py: bad line in non-scored-words " - "file {0}: {1}".format(non_scored_words_file, line)) - non_scored_words.add(a[0]) - f.close() - - - -# The ctm-edits file format is as follows [note: file-id is really utterance-id -# in this context]. -# -# e.g.: -# AJJacobs_2007P-0001605-0003029 1 0 0.09 1.0 sil -# AJJacobs_2007P-0001605-0003029 1 0.09 0.15 i 1.0 i cor -# ... -# This function processes a single line of ctm-edits input for fixing -# "non-scored" words. The input 'a' is the split line as an array of fields. -# It modifies the object 'a'. This function returns the modified array, -# and please note that it is destructive of its input 'a'. -# If it returnso the empty array then the line is to be deleted. -def ProcessLineForNonScoredWords(a): - global num_lines, num_correct_lines, ref_change_stats - try: - assert len(a) == 8 - num_lines += 1 - # we could do: - # [ file, channel, start, duration, hyp_word, confidence, ref_word, edit_type ] = a - duration = a[3] - hyp_word = a[4] - ref_word = a[6] - edit_type = a[7] - if edit_type == 'ins': - assert ref_word == '' - if hyp_word in non_scored_words: - # insert this non-scored word into the reference. - ref_change_stats[ref_word + ' -> ' + hyp_word] += 1 - ref_word = hyp_word - edit_type = 'fix' - elif edit_type == 'del': - assert hyp_word == '' and float(duration) == 0.0 - if ref_word in non_scored_words: - ref_change_stats[ref_word + ' -> ' + hyp_word] += 1 - return [] - elif edit_type == 'sub': - if hyp_word in non_scored_words and ref_word in non_scored_words: - # we also allow replacing one non-scored word with another. - ref_change_stats[ref_word + ' -> ' + hyp_word] += 1 - ref_word = hyp_word - edit_type = 'fix' - else: - assert edit_type == 'cor' or edit_type == 'sil' - num_correct_lines += 1 - - a[4] = hyp_word - a[6] = ref_word - a[7] = edit_type - return a - - except Exception as e: - print("modify_ctm_edits.py: bad line in ctm-edits input: " + ' '.join(a), - file = sys.stderr) - print("modify_ctm_edits.py: exception was: " + str(e), - file = sys.stderr) - sys.exit(1) - -# This function processes the split lines of one utterance (as a -# list of lists of fields), to allow repetitions of words, so if the -# reference says 'i' but the hyp says 'i i', or the ref says -# 'you know' and the hyp says 'you know you know', we change the -# ref to match. -# It returns the modified list-of-lists [but note that the input -# is actually modified]. -def ProcessUtteranceForRepetitions(split_lines_of_utt): - global non_scored_words, repetition_stats - # The array 'selected_lines' will contain the indexes of of selected - # elements of 'split_lines_of_utt'. Consider split_line = - # split_lines_of_utt[i]. If the hyp and ref words in split_line are both - # either '' or non-scoreable words, we discard the index. - # Otherwise we put it into selected_lines. - selected_line_indexes = [] - # selected_edits will contain, for each element of selected_line_indexes, the - # corresponding edit_type from the original utterance previous to - # this function call ('cor', 'ins', etc.). 
-
-# This function processes the split lines of one utterance (as a
-# list of lists of fields), to allow repetitions of words, so if the
-# reference says 'i' but the hyp says 'i i', or the ref says
-# 'you know' and the hyp says 'you know you know', we change the
-# ref to match.
-# It returns the modified list-of-lists [but note that the input
-# is actually modified].
-def ProcessUtteranceForRepetitions(split_lines_of_utt):
-    global non_scored_words, repetition_stats
-    # The array 'selected_line_indexes' will contain the indexes of selected
-    # elements of 'split_lines_of_utt'.  Consider split_line =
-    # split_lines_of_utt[i].  If the hyp and ref words in split_line are both
-    # either '<eps>' or non-scoreable words, we discard the index.
-    # Otherwise we put it into selected_line_indexes.
-    selected_line_indexes = []
-    # selected_edits will contain, for each element of selected_line_indexes, the
-    # corresponding edit_type from the original utterance previous to
-    # this function call ('cor', 'ins', etc.).
-    #
-    # As a special case, if there was a substitution ('sub') where the
-    # reference word was a non-scored word and the hyp word was a real word,
-    # we mark it in this array as 'ins', because for purposes of this algorithm
-    # it behaves the same as an insertion.
-    #
-    # Whenever we do any operation that will change the reference, we change
-    # all the selected_edits in the array to None so that they won't match
-    # any further operations.
-    selected_edits = []
-    # selected_hyp_words will contain, for each element of selected_line_indexes, the
-    # corresponding hyp_word.
-    selected_hyp_words = []
-
-    for i in range(len(split_lines_of_utt)):
-        split_line = split_lines_of_utt[i]
-        hyp_word = split_line[4]
-        ref_word = split_line[6]
-        # keep_this_line will be True if we are going to keep this line in the
-        # 'selected lines' for further processing of repetitions.  We only
-        # eliminate lines involving non-scored words or epsilon in both hyp
-        # and reference position
-        # [note: epsilon in hyp position for non-empty segments indicates
-        # optional-silence, and it does make sense to make this 'invisible',
-        # just like non-scored words, for the purposes of this code.]
-        keep_this_line = True
-        if (hyp_word == '<eps>' or hyp_word in non_scored_words) and \
-           (ref_word == '<eps>' or ref_word in non_scored_words):
-            keep_this_line = False
-        if keep_this_line:
-            selected_line_indexes.append(i)
-            edit_type = split_line[7]
-            if edit_type == 'sub' and ref_word in non_scored_words:
-                assert not hyp_word in non_scored_words
-                # For purposes of this algorithm, substitution of, say,
-                # '[COUGH]' by 'hello' behaves like an insertion of 'hello',
-                # since we're willing to remove the '[COUGH]' from the
-                # transcript.
-                edit_type = 'ins'
-            selected_edits.append(edit_type)
-            selected_hyp_words.append(hyp_word)
-
-    # indexes_to_fix will be a list of indexes into 'selected_indexes' where we
-    # plan to fix the ref to match the hyp.
-    indexes_to_fix = []
-
-    # This loop scans for, and fixes, two-word insertions that follow,
-    # or precede, the corresponding correct words.
-    for i in range(0, len(selected_line_indexes) - 3):
-        this_indexes = selected_line_indexes[i:i+4]
-        this_hyp_words = selected_hyp_words[i:i+4]
-
-        if this_hyp_words[0] == this_hyp_words[2] and \
-           this_hyp_words[1] == this_hyp_words[3] and \
-           this_hyp_words[0] != this_hyp_words[1]:
-            # if the hyp words were of the form [ 'a', 'b', 'a', 'b' ]...
-            this_edits = selected_edits[i:i+4]
-            if this_edits == [ 'cor', 'cor', 'ins', 'ins' ] or \
-               this_edits == [ 'ins', 'ins', 'cor', 'cor' ]:
-                if this_edits[0] == 'cor':
-                    indexes_to_fix += [ i+2, i+3 ]
-                else:
-                    indexes_to_fix += [ i, i+1 ]
-                word_pair = this_hyp_words[0] + ' ' + this_hyp_words[1]
-                # e.g. word_pair = 'hi there'
-                # add 2 because these stats are of words.
-                repetition_stats[word_pair] += 2
-                # the next line prevents this region of the text being used
-                # in any further edits.
-                selected_edits[i:i+4] = [ None, None, None, None ]
-
-    # This loop scans for, and fixes, one-word insertions that follow,
-    # or precede, the corresponding correct words.
-    for i in range(0, len(selected_line_indexes) - 1):
-        this_indexes = selected_line_indexes[i:i+2]
-        this_hyp_words = selected_hyp_words[i:i+2]
-
-        if this_hyp_words[0] == this_hyp_words[1]:
-            # if the hyp words were of the form [ 'a', 'a' ]...
-            this_edits = selected_edits[i:i+2]
-            if this_edits == [ 'cor', 'ins' ] or this_edits == [ 'ins', 'cor' ]:
-                if this_edits[0] == 'cor':
-                    indexes_to_fix.append(i+1)
-                else:
-                    indexes_to_fix.append(i)
-                repetition_stats[this_hyp_words[0]] += 1
-                # the next line prevents this region of the text being used
-                # in any further edits.
-                selected_edits[i:i+2] = [ None, None ]
-
-    for i in indexes_to_fix:
-        j = selected_line_indexes[i]
-        split_line = split_lines_of_utt[j]
-        ref_word = split_line[6]
-        hyp_word = split_line[4]
-        assert ref_word == '<eps>' or ref_word in non_scored_words
-        # we replace the reference with the decoded word, which will be a
-        # repetition.
-        split_line[6] = hyp_word
-        split_line[7] = 'cor'
-
-    return split_lines_of_utt
-
-
-# note: split_lines_of_utt is a list of lists, one per line, each containing the
-# sequence of fields.
-# Returns the same format of data after processing.
-def ProcessUtterance(split_lines_of_utt):
-    new_split_lines_of_utt = []
-    for split_line in split_lines_of_utt:
-        new_split_line = ProcessLineForNonScoredWords(split_line)
-        if new_split_line != []:
-            new_split_lines_of_utt.append(new_split_line)
-    if args.allow_repetitions == 'true':
-        new_split_lines_of_utt = ProcessUtteranceForRepetitions(new_split_lines_of_utt)
-    return new_split_lines_of_utt
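The pattern the two repetition loops above look for can be summarized in a few lines (standalone sketch; example words invented):

    # A two-word repetition: hyp 'you know you know' against ref 'you know',
    # where the aligner marked the second pair as insertions.
    hyp_words = ['you', 'know', 'you', 'know']
    edits = ['cor', 'cor', 'ins', 'ins']
    is_repetition = (hyp_words[0:2] == hyp_words[2:4]
                     and hyp_words[0] != hyp_words[1]
                     and edits in (['cor', 'cor', 'ins', 'ins'],
                                   ['ins', 'ins', 'cor', 'cor']))
    assert is_repetition  # the 'ins' pair's ref words would be fixed to match the hyp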
-
-
-def ProcessData():
-    try:
-        f_in = open(args.ctm_edits_in)
-    except:
-        sys.exit("modify_ctm_edits.py: error opening ctm-edits input "
-                 "file {0}".format(args.ctm_edits_in))
-    try:
-        f_out = open(args.ctm_edits_out, 'w')
-    except:
-        sys.exit("modify_ctm_edits.py: error opening ctm-edits output "
-                 "file {0}".format(args.ctm_edits_out))
-    num_lines_processed = 0
-
-    # Most of what we're doing in the lines below is splitting the input lines
-    # and grouping them per utterance, before giving them to ProcessUtterance()
-    # and then printing the modified lines.
-    first_line = f_in.readline()
-    if first_line == '':
-        sys.exit("modify_ctm_edits.py: empty input")
-    split_pending_line = first_line.split()
-    if len(split_pending_line) == 0:
-        sys.exit("modify_ctm_edits.py: bad input line " + first_line)
-    cur_utterance = split_pending_line[0]
-    split_lines_of_cur_utterance = []
-
-    while True:
-        if len(split_pending_line) == 0 or split_pending_line[0] != cur_utterance:
-            split_lines_of_cur_utterance = ProcessUtterance(split_lines_of_cur_utterance)
-            for split_line in split_lines_of_cur_utterance:
-                print(' '.join(split_line), file = f_out)
-            split_lines_of_cur_utterance = []
-            if len(split_pending_line) == 0:
-                break
-            else:
-                cur_utterance = split_pending_line[0]
-
-        split_lines_of_cur_utterance.append(split_pending_line)
-        next_line = f_in.readline()
-        split_pending_line = next_line.split()
-        if len(split_pending_line) == 0:
-            if next_line != '':
-                sys.exit("modify_ctm_edits.py: got an empty or whitespace input line")
-    try:
-        f_out.close()
-    except:
-        sys.exit("modify_ctm_edits.py: error closing ctm-edits output "
-                 "(broken pipe or full disk?)")
-
-def PrintNonScoredStats():
-    if args.verbose < 1:
-        return
-    if num_lines == 0:
-        print("modify_ctm_edits.py: processed no input.", file = sys.stderr)
-    num_lines_modified = sum(ref_change_stats.values())
-    num_incorrect_lines = num_lines - num_correct_lines
-    percent_lines_incorrect = '%.2f' % (num_incorrect_lines * 100.0 / num_lines)
-    percent_modified = '%.2f' % (num_lines_modified * 100.0 / num_lines)
-    percent_of_incorrect_modified = '%.2f' % (num_lines_modified * 100.0 / num_incorrect_lines)
-    print("modify_ctm_edits.py: processed {0} lines of ctm ({1}% of which incorrect), "
-          "of which {2} were changed fixing the reference for non-scored words "
-          "({3}% of lines, or {4}% of incorrect lines)".format(
-              num_lines, percent_lines_incorrect, num_lines_modified,
-              percent_modified, percent_of_incorrect_modified),
-          file = sys.stderr)
-
-    keys = sorted(ref_change_stats.keys(), reverse=True,
-                  key = lambda x: ref_change_stats[x])
-    num_keys_to_print = 40 if args.verbose >= 2 else 10
-
-    print("modify_ctm_edits.py: most common edits (as percentages "
-          "of all such edits) are:\n" +
-          ('\n'.join([ '%s [%.2f%%]' % (k, ref_change_stats[k]*100.0/num_lines_modified)
-                       for k in keys[0:num_keys_to_print]]))
-          + ('\n...' if num_keys_to_print < len(keys) else ''),
-          file = sys.stderr)
-
-
-def PrintRepetitionStats():
-    if args.verbose < 1 or sum(repetition_stats.values()) == 0:
-        return
-    num_lines_modified = sum(repetition_stats.values())
-    num_incorrect_lines = num_lines - num_correct_lines
-    percent_lines_incorrect = '%.2f' % (num_incorrect_lines * 100.0 / num_lines)
-    percent_modified = '%.2f' % (num_lines_modified * 100.0 / num_lines)
-    percent_of_incorrect_modified = '%.2f' % (num_lines_modified * 100.0 / num_incorrect_lines)
-    print("modify_ctm_edits.py: processed {0} lines of ctm ({1}% of which incorrect), "
-          "of which {2} were changed fixing the reference for repetitions ({3}% of "
-          "lines, or {4}% of incorrect lines)".format(
-              num_lines, percent_lines_incorrect, num_lines_modified,
-              percent_modified, percent_of_incorrect_modified),
-          file = sys.stderr)
-
-    keys = sorted(repetition_stats.keys(), reverse=True,
-                  key = lambda x: repetition_stats[x])
-    num_keys_to_print = 40 if args.verbose >= 2 else 10
-
-    print("modify_ctm_edits.py: most common repetitions inserted into reference (as percentages "
-          "of all words fixed in this way) are:\n" +
-          ('\n'.join([ '%s [%.2f%%]' % (k,
repetition_stats[k]*100.0/num_lines_modified) - for k in keys[0:num_keys_to_print]])) - + '\n...' if num_keys_to_print < len(keys) else '', - file = sys.stderr) - - -non_scored_words = set() -ReadNonScoredWords(args.non_scored_words_in) - -num_lines = 0 -num_correct_lines = 0 -# ref_change_stats will be a map from a string like -# 'foo -> bar' to an integer count; it keeps track of how much we changed -# the reference. -ref_change_stats = defaultdict(int) -# repetition_stats will be a map from strings like -# 'a', or 'a b' (the repeated strings), to an integer count; like -# ref_change_stats, it keeps track of how many changes we made -# in allowing repetitions. -repetition_stats = defaultdict(int) - -ProcessData() -PrintNonScoredStats() -PrintRepetitionStats() diff --git a/egs/wsj/s5/steps/cleanup/segment_ctm_edits.py b/egs/wsj/s5/steps/cleanup/segment_ctm_edits.py deleted file mode 100755 index 7e635d66169..00000000000 --- a/egs/wsj/s5/steps/cleanup/segment_ctm_edits.py +++ /dev/null @@ -1,1034 +0,0 @@ -#!/usr/bin/env python - -# Copyright 2016 Vimal Manohar -# 2016 Johns Hopkins University (author: Daniel Povey) -# Apache 2.0 - -from __future__ import print_function -import sys, operator, argparse, os -from collections import defaultdict - -# This script reads the 'ctm-edits' file format that is produced by get_ctm_edits.py -# and modified by modify_ctm_edits.py and taint_ctm_edits.py. Its function is to -# produce a segmentation and text from the ctm-edits input. - -# The ctm-edits file format that this script expects is as follows: -# <file-id> <channel> <start-time> <duration> <hyp-word> <confidence> <ref-word> <edit-type> ['tainted'] -# [note: file-id is really utterance-id at this point]. - -parser = argparse.ArgumentParser( - description = "This program produces segmentation and text information " - "based on reading ctm-edits input format which is produced by " - "steps/cleanup/get_ctm_edits.py, steps/cleanup/modify_ctm_edits.py and " - "steps/cleanup/taint_ctm_edits.py.") - -parser.add_argument("--min-segment-length", type = float, default = 0.5, - help = "Minimum allowed segment length (in seconds) for any " - "segment; shorter segments than this will be discarded.") -parser.add_argument("--min-new-segment-length", type = float, default = 1.0, - help = "Minimum allowed segment length (in seconds) for newly " - "created segments (i.e. not identical to the input utterances). " - "Expected to be >= --min-segment-length.") -parser.add_argument("--frame-length", type = float, default = 0.01, - help = "This only affects rounding of the output times; they will " - "be constrained to multiples of this value.") -parser.add_argument("--max-tainted-length", type = float, default = 0.05, - help = "Maximum allowed length of any 'tainted' line. Note: " - "'tainted' lines may only appear at the boundary of a " - "segment.") -parser.add_argument("--max-edge-silence-length", type = float, default = 0.5, - help = "Maximum allowed length of silence if it appears at the " - "edge of a segment (will be truncated). This rule is " - "relaxed if such truncation would take a segment below " - "the --min-segment-length or --min-new-segment-length.") -parser.add_argument("--max-edge-non-scored-length", type = float, default = 0.5, - help = "Maximum allowed length of a non-scored word (noise, cough, etc.) " - "if it appears at the edge of a segment (will be truncated).
" - "This rule is relaxed if such truncation would take a " - "segment below the --min-segment-length.") -parser.add_argument("--max-internal-silence-length", type = float, default = 2.0, - help = "Maximum allowed length of silence if it appears inside a segment " - "(will cause the segment to be split).") -parser.add_argument("--max-internal-non-scored-length", type = float, default = 2.0, - help = "Maximum allowed length of a non-scored word (noise, etc.) if " - "it appears inside a segment (will cause the segment to be " - "split). Note: reference words which are real words but OOV " - "are not included in this category.") -parser.add_argument("--unk-padding", type = float, default = 0.05, - help = "Amount of padding with that we do if a segment boundary is " - "next to errors (ins, del, sub). That is, we add this amount of " - "time to the segment and add the word to cover the acoustics. " - "If nonzero, the --oov-symbol-file option must be supplied.") -parser.add_argument("--max-junk-proportion", type = float, default = 0.1, - help = "Maximum proportion of the time of the segment that may " - "consist of potentially bad data, in which we include 'tainted' lines of " - "the ctm-edits input and unk-padding.") -parser.add_argument("--max-deleted-words-kept-when-merging", type = str, default = 1, - help = "When merging segments that are found to be overlapping or " - "adjacent after all other processing, keep in the transcript the " - "reference words that were deleted between the segments [if any] " - "as long as there were no more than this many reference words. " - "Setting this to zero will mean that any reference words that " - "were deleted between the segments we're about to reattach will " - "not appear in the generated transcript (so we'll match the hyp).") -parser.add_argument("--oov-symbol-file", type = str, default = None, - help = "Filename of file such as data/lang/oov.txt which contains " - "the text form of the OOV word, normally ''. Supplied as " - "a file to avoid complications with escaping. Necessary if " - "the --unk-padding option has a nonzero value (which it does " - "by default.") -parser.add_argument("--ctm-edits-out", type = str, - help = "Filename to output an extended version of the ctm-edits format " - "with segment start and end points noted. This file is intended to be " - "read by humans; there are currently no scripts that will read it.") -parser.add_argument("--word-stats-out", type = str, - help = "Filename for output of word-level stats, of the form " - "' ', e.g. 'hello 0.12 12408', " - "where the is the proportion of the time that this " - "reference word does not make it into a segment. It can help reveal words " - "that have problematic pronunciations or are associated with " - "transcription errors.") - - -parser.add_argument("non_scored_words_in", metavar = "", - help="Filename of file containing a list of non-scored words, " - "one per line. See steps/cleanup/get_nonscored_words.py.") -parser.add_argument("ctm_edits_in", metavar = "", - help = "Filename of input ctm-edits file. " - "Use /dev/stdin for standard input.") -parser.add_argument("text_out", metavar = "", - help = "Filename of output text file (same format as data/train/text, i.e. " - " ... ") -parser.add_argument("segments_out", metavar = "", - help = "Filename of output segments. 
This has the same format as data/train/segments, " - "but instead of <recording-id>, the second field is the old utterance-id, i.e. " - "<new-utterance-id> <old-utterance-id> <start-time> <end-time>") - -args = parser.parse_args() - - - - -def IsTainted(split_line_of_utt): - return len(split_line_of_utt) > 8 and split_line_of_utt[8] == 'tainted' - -# This function returns a list of pairs (start-index, end-index) representing -# the cores of segments (so if a pair is (s, e), then the core of a segment -# would span (s, s+1, ..., e-1)). -# -# By the 'core of a segment', we mean a sequence of ctm-edits lines including at -# least one 'cor' line and a contiguous sequence of other lines of the type -# 'cor', 'fix' and 'sil' that must not be tainted. The segment core excludes -# any tainted lines at the edge of a segment, which will be added later. -# -# We only initiate a segment when it contains something correct and not realized -# as unk (i.e. ref==hyp); and we extend it with anything that is 'sil' or 'fix' -# or 'cor' that is not tainted. Contiguous regions of 'true' in the resulting -# boolean array will then become the cores of prototype segments, and we'll add -# any adjacent tainted words (or parts of them). -def ComputeSegmentCores(split_lines_of_utt): - num_lines = len(split_lines_of_utt) - line_is_in_segment_core = [ False ] * num_lines - for i in range(num_lines): - if split_lines_of_utt[i][7] == 'cor' and \ - split_lines_of_utt[i][4] == split_lines_of_utt[i][6]: - line_is_in_segment_core[i] = True - - # extend each proto-segment forwards as far as we can: - for i in range(1, num_lines): - if line_is_in_segment_core[i-1] and not line_is_in_segment_core[i]: - edit_type = split_lines_of_utt[i][7] - if not IsTainted(split_lines_of_utt[i]) and \ - (edit_type == 'cor' or edit_type == 'sil' or edit_type == 'fix'): - line_is_in_segment_core[i] = True - - # extend each proto-segment backwards as far as we can: - for i in reversed(range(0, num_lines - 1)): - if line_is_in_segment_core[i+1] and not line_is_in_segment_core[i]: - edit_type = split_lines_of_utt[i][7] - if not IsTainted(split_lines_of_utt[i]) and \ - (edit_type == 'cor' or edit_type == 'sil' or edit_type == 'fix'): - line_is_in_segment_core[i] = True - - - segment_ranges = [] - cur_segment_start = None - for i in range(0, num_lines): - if line_is_in_segment_core[i]: - if cur_segment_start == None: - cur_segment_start = i - else: - if cur_segment_start != None: - segment_ranges.append( (cur_segment_start, i) ) - cur_segment_start = None - if cur_segment_start != None: - segment_ranges.append( (cur_segment_start, num_lines) ) - - return segment_ranges - -class Segment: - def __init__(self, split_lines_of_utt, start_index, end_index, debug_str = None): - self.split_lines_of_utt = split_lines_of_utt - # start_index is the index of the first line that appears in this - # segment, and end_index is one past the last line. This does not - # include unk-padding. - self.start_index = start_index - self.end_index = end_index - # If the following values are nonzero, then when we create the segment - # we will add <unk> at the start and end of the segment [representing - # partial words], with this amount of additional audio. - self.start_unk_padding = 0.0 - self.end_unk_padding = 0.0 - - # debug_str keeps track of the 'core' of the segment. - if debug_str == None: - debug_str = 'core-start={0},core-end={1}'.format(start_index,end_index) - self.debug_str = debug_str - - # This gives the proportion of the time of the first line in the segment - # that we keep.
Usually 1.0 but may be less if we've trimmed away some - # proportion of the time. - self.start_keep_proportion = 1.0 - # This gives the proportion of the time of the last line in the segment - # that we keep. Usually 1.0 but may be less if we've trimmed away some - # proportion of the time. - self.end_keep_proportion = 1.0 - - # This is stage 1 of segment processing (after creating the boundaries of the - # core of the segment, which is done outside of this class). - # - # This function may reduce start_index and/or increase end_index by - # including a single adjacent 'tainted' line from the ctm-edits file. This - # is only done if the lines at the boundaries of the segment are currently - # real non-silence words and not non-scored words. The idea is that we - # probably don't want to start or end the segment right at the boundary of a - # real word, we want to add some kind of padding. - def PossiblyAddTaintedLines(self): - global non_scored_words - split_lines_of_utt = self.split_lines_of_utt - # we're iterating over the two boundaries of the segment (start and end) - for b in [False, True]: - if b: - boundary_index = self.end_index - 1 - adjacent_index = self.end_index - else: - boundary_index = self.start_index - adjacent_index = self.start_index - 1 - if adjacent_index >= 0 and adjacent_index < len(split_lines_of_utt): - # only consider merging the adjacent word into the segment if we're not - # at a segment boundary. - adjacent_line_is_tainted = IsTainted(split_lines_of_utt[adjacent_index]) - # if the adjacent line wasn't tainted, then there must have been - # another stronger reason why we didn't include it in the core - # of the segment (probably that it was an ins, del or sub), so - # there is no point considering it. - if adjacent_line_is_tainted: - boundary_edit_type = split_lines_of_utt[boundary_index][7] - boundary_hyp_word = split_lines_of_utt[boundary_index][4] - # we only add the tainted line to the segment if the word at - # the boundary was a non-silence word that was correctly - # decoded and not fixed [see modify_ctm_edits.py.] - if boundary_edit_type == 'cor' and \ - not boundary_hyp_word in non_scored_words: - # Add the adjacent tainted line to the segment. - if b: - self.end_index += 1 - else: - self.start_index -= 1 - - # This is stage 2 of segment processing. - # This function will split a segment into multiple pieces if any of the - # internal [non-boundary] silences or non-scored words are longer - # than the allowed values --max-internal-silence-length and - # --max-internal-non-scored-length. This function returns a - # list of segments. In the normal case (where there is no splitting) - # it just returns an array with a single element 'self'. - def PossiblySplitSegment(self): - global non_scored_words, args - # make sure the segment hasn't been processed more than we expect. - assert self.start_unk_padding == 0.0 and self.end_unk_padding == 0.0 and \ - self.start_keep_proportion == 1.0 and self.end_keep_proportion == 1.0 - segments = [] # the answer - cur_start_index = self.start_index - cur_start_is_split = False - # only consider splitting at non-boundary lines. [we'd just truncate - # the boundary lines.]
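# [Editor's aside -- illustrative sketch, not part of the original script.]
# The loop below implements the stage-2 split rule: split at any internal
# line that is a silence longer than --max-internal-silence-length, or a
# non-scored word longer than --max-internal-non-scored-length. A toy
# version of just the decision (the default thresholds of 2.0 are assumed):
def should_split_at(split_line, non_scored_words,
                    max_internal_silence=2.0, max_internal_non_scored=2.0):
    duration = float(split_line[3])
    ref_word, edit_type = split_line[6], split_line[7]
    if edit_type == 'sil' and duration > max_internal_silence:
        return True
    return ref_word in non_scored_words and duration > max_internal_non_scored

# e.g. a 2.5-second internal silence triggers a split:
# should_split_at(['utt', '1', '4.0', '2.5', '<eps>', '1.0', '<eps>', 'sil'], set())
# -> True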
- for index_to_split_at in range(cur_start_index + 1, self.end_index - 1): - this_split_line = self.split_lines_of_utt[index_to_split_at] - this_duration = float(this_split_line[3]) - this_edit_type = this_split_line[7] - this_ref_word = this_split_line[6] - if (this_edit_type == 'sil' and this_duration > args.max_internal_silence_length) or \ - (this_ref_word in non_scored_words and this_duration > args.max_internal_non_scored_length): - # We split this segment at this index, dividing the word in two - # [later on, in PossiblyTruncateBoundaries, it may be further - # truncated.] - # Note: we use 'index_to_split_at + 1' because the Segment constructor - # takes an 'end-index' which is interpreted as one past the end. - new_segment = Segment(self.split_lines_of_utt, cur_start_index, - index_to_split_at + 1, self.debug_str) - if cur_start_is_split: - new_segment.start_keep_proportion = 0.5 - new_segment.end_keep_proportion = 0.5 - cur_start_is_split = True - cur_start_index = index_to_split_at - segments.append(new_segment) - if len(segments) == 0: # We did not split. - segments.append(self) - else: - # We did split. Add the very last segment. - new_segment = Segment(self.split_lines_of_utt, cur_start_index, - self.end_index, self.debug_str) - assert cur_start_is_split - new_segment.start_keep_proportion = 0.5 - segments.append(new_segment) - return segments - - - # This is stage 3 of segment processing. It will truncate the silences and - # non-scored words at the segment boundaries if they are longer than the - # --max-edge-silence-length and --max-edge-non-scored-length respectively - # (and to the extent that this wouldn't take us below the - # --min-segment-length or --min-new-segment-length). - def PossiblyTruncateBoundaries(self): - for b in [True, False]: - if b: - this_index = self.start_index - else: - this_index = self.end_index - 1 - this_split_line = self.split_lines_of_utt[this_index] - truncated_duration = None - this_duration = float(this_split_line[3]) - this_edit = this_split_line[7] - this_ref_word = this_split_line[6] - if this_edit == 'sil' and \ - this_duration > args.max_edge_silence_length: - truncated_duration = args.max_edge_silence_length - elif this_ref_word in non_scored_words and \ - this_duration > args.max_edge_non_scored_length: - truncated_duration = args.max_edge_non_scored_length - if truncated_duration != None: - keep_proportion = truncated_duration / this_duration - if b: - self.start_keep_proportion = keep_proportion - else: - self.end_keep_proportion = keep_proportion - - # This relaxes the segment-boundary truncation of - # PossiblyTruncateBoundaries(), if it would take us below - # min-new-segment-length or min-segment-length. Note: this does not relax - # the boundary truncation for a particular boundary (start or end) if that - # boundary corresponds to a 'tainted' line of the ctm (because it's - # dangerous to include too much 'tainted' audio). - def RelaxBoundaryTruncation(self): - # this should be called before adding unk padding. - assert self.start_unk_padding == self.end_unk_padding == 0.0 - if self.start_keep_proportion == self.end_keep_proportion == 1.0: - return # nothing to do; there was no truncation. - length_cutoff = max(args.min_new_segment_length, args.min_segment_length) - length_with_truncation = self.Length() - if length_with_truncation >= length_cutoff: - return # Nothing to do.
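# [Editor's aside -- a numeric check, not part of the original script.]
# The relaxation below linearly interpolates between the truncated and the
# fully relaxed boundary proportions so that the segment lands exactly on
# the length cutoff. Since Length() is linear in the keep-proportions, the
# interpolation can be checked directly on the lengths (values hypothetical):
length_with_truncation = 0.8          # truncated length, in seconds
length_with_relaxed_boundaries = 1.6  # fully relaxed length, in seconds
length_cutoff = 1.0
a = (length_cutoff - length_with_relaxed_boundaries) / \
    (length_with_truncation - length_with_relaxed_boundaries)
# a == 0.75, and 0.75 * 0.8 + 0.25 * 1.6 == 1.0, exactly the cutoff:
assert abs(a * length_with_truncation
           + (1 - a) * length_with_relaxed_boundaries - length_cutoff) < 1e-9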
- orig_start_keep_proportion = self.start_keep_proportion - orig_end_keep_proportion = self.end_keep_proportion - if not IsTainted(self.split_lines_of_utt[self.start_index]): - self.start_keep_proportion = 1.0 - if not IsTainted(self.split_lines_of_utt[self.end_index - 1]): - self.end_keep_proportion = 1.0 - length_with_relaxed_boundaries = self.Length() - if length_with_relaxed_boundaries <= length_cutoff: - # Completely undo the truncation [to the extent allowed by the - # presence of tainted lines at the start/end] if, even without - # truncation, we'd be below the length cutoff. This segment may be - # removed later on (but it may not, if removing truncation makes us - # identical to the input utterance, and the length is between - # min_segment_length and min_new_segment_length). - return - # Next, compute an interpolation constant a such that the - # {start,end}_keep_proportion values will equal a * - # [values-computed-by-PossiblyTruncateBoundaries()] + (1-a) * [completely-relaxed-values]. - # we're solving the equation: - # length_cutoff = a * length_with_truncation + (1-a) * length_with_relaxed_boundaries - # -> length_cutoff - length_with_relaxed_boundaries = - # a * (length_with_truncation - length_with_relaxed_boundaries) - # -> a = (length_cutoff - length_with_relaxed_boundaries) / (length_with_truncation - length_with_relaxed_boundaries) - a = (length_cutoff - length_with_relaxed_boundaries) / \ - (length_with_truncation - length_with_relaxed_boundaries) - if a < 0.0 or a > 1.0: - print("segment_ctm_edits.py: bad 'a' value = {0}".format(a), file = sys.stderr) - return - self.start_keep_proportion = \ - a * orig_start_keep_proportion + (1-a) * self.start_keep_proportion - self.end_keep_proportion = \ - a * orig_end_keep_proportion + (1-a) * self.end_keep_proportion - if not abs(self.Length() - length_cutoff) < 0.01: - print("segment_ctm_edits.py: possible problem relaxing boundary " - "truncation, length is {0} vs {1}".format(self.Length(), length_cutoff), - file = sys.stderr) - - - # This is stage 4 of segment processing. - # This function may set start_unk_padding and end_unk_padding to nonzero - # values. This is done if the current boundary words are real, scored - # words and we're not next to the beginning or end of the utterance. - def PossiblyAddUnkPadding(self): - for b in [True, False]: - if b: - this_index = self.start_index - else: - this_index = self.end_index - 1 - this_split_line = self.split_lines_of_utt[this_index] - this_start_time = float(this_split_line[2]) - this_ref_word = this_split_line[6] - this_edit = this_split_line[7] - if this_edit == 'cor' and not this_ref_word in non_scored_words: - # we can consider adding unk-padding. - if b: # start of utterance. - unk_padding = args.unk_padding - if unk_padding > this_start_time: # close to beginning of file - unk_padding = this_start_time - # If we could add less than half of the specified - # unk-padding, don't add any (because when we add - # unk-padding we add the unknown-word symbol '<unk>', and if - # there isn't enough space to traverse the HMM we don't want - # to do it at all). - if unk_padding < 0.5 * args.unk_padding: - unk_padding = 0.0 - self.start_unk_padding = unk_padding - else: # end of utterance.
- this_end_time = this_start_time + float(this_split_line[3]) - last_line = self.split_lines_of_utt[-1] - utterance_end_time = float(last_line[2]) + float(last_line[3]) - max_allowable_padding = utterance_end_time - this_end_time - assert max_allowable_padding > -0.01 - unk_padding = args.unk_padding - if unk_padding > max_allowable_padding: - unk_padding = max_allowable_padding - # If we could add less than half of the specified - # unk-padding, don't add any (because when we add - # unk-padding we add the unknown-word symbol '<unk>', and if - # there isn't enough space to traverse the HMM we don't want - # to do it at all). - if unk_padding < 0.5 * args.unk_padding: - unk_padding = 0.0 - self.end_unk_padding = unk_padding - - # This function will merge the segment in 'other' with the segment - # in 'self'. It is only to be called when 'self' and 'other' are from - # the same utterance, 'other' is after 'self' in time order (based on - # the original segment cores), and self.EndTime() >= other.StartTime(). - # Note: in this situation there will normally be deleted words - # between the two segments. What this program does with the deleted - # words depends on '--max-deleted-words-kept-when-merging'. If there - # were any inserted words in the transcript (less likely), this - # program will keep the reference. - def MergeWithSegment(self, other): - assert self.EndTime() >= other.StartTime() and \ - self.StartTime() < other.EndTime() and \ - self.split_lines_of_utt is other.split_lines_of_utt - orig_self_end_index = self.end_index - self.debug_str = "({0}/merged-with/{1})".format(self.debug_str, other.debug_str) - # everything that relates to the end of this segment gets copied - # from 'other'. - self.end_index = other.end_index - self.end_unk_padding = other.end_unk_padding - self.end_keep_proportion = other.end_keep_proportion - # The next thing we have to do is to go over any lines of the ctm that - # appear between 'self' and 'other', or are shared between both (this - # would only happen for tainted silence or non-scored-word segments), - # and decide what to do with them. We'll keep the reference for any - # substitutions or insertions (which anyway are unlikely to appear - # in these merged segments). Note: most of this happens in self.Text(), - # but at this point we need to decide whether to mark any deletions - # as 'discard-this-word'. - first_index_of_overlap = min(orig_self_end_index - 1, other.start_index) - last_index_of_overlap = max(orig_self_end_index - 1, other.start_index) - num_deleted_words = 0 - for i in range(first_index_of_overlap, last_index_of_overlap + 1): - edit_type = self.split_lines_of_utt[i][7] - if edit_type == 'del': - num_deleted_words += 1 - if num_deleted_words > args.max_deleted_words_kept_when_merging: - for i in range(first_index_of_overlap, last_index_of_overlap + 1): - if self.split_lines_of_utt[i][7] == 'del': - self.split_lines_of_utt[i].append('do-not-include-in-text') - - # Returns the start time of the segment (within the enclosing utterance). - # This is before any rounding. - def StartTime(self): - first_line = self.split_lines_of_utt[self.start_index] - first_line_start = float(first_line[2]) - first_line_duration = float(first_line[3]) - first_line_end = first_line_start + first_line_duration - return first_line_end - self.start_unk_padding \ - - (first_line_duration * self.start_keep_proportion) - - # Returns some string-valued information about 'this' that is useful for debugging.
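# [Editor's aside -- a worked example of the StartTime() arithmetic above,
# not part of the original script.] With a first line starting at 1.0 s and
# lasting 0.4 s, of which the last half is kept (start_keep_proportion = 0.5),
# plus 0.05 s of unk-padding: (1.0 + 0.4) - 0.05 - 0.4 * 0.5 = 1.15 s.
first_line_start, first_line_duration = 1.0, 0.4
start_unk_padding, start_keep_proportion = 0.05, 0.5
start_time = (first_line_start + first_line_duration) \
    - start_unk_padding - first_line_duration * start_keep_proportion
assert abs(start_time - 1.15) < 1e-9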
- def DebugInfo(self): - return 'start=%d,end=%d,unk-padding=%.2f,%.2f,keep-proportion=%.2f,%.2f,' % \ - (self.start_index, self.end_index, self.start_unk_padding, - self.end_unk_padding, self.start_keep_proportion, self.end_keep_proportion) + \ - self.debug_str - - # Returns the end time of the segment (within the enclosing utterance). - def EndTime(self): - last_line = self.split_lines_of_utt[self.end_index - 1] - last_line_start = float(last_line[2]) - last_line_duration = float(last_line[3]) - return last_line_start + (last_line_duration * self.end_keep_proportion) \ - + self.end_unk_padding - - # Returns the segment length in seconds. - def Length(self): - return self.EndTime() - self.StartTime() - - def IsWholeUtterance(self): - # returns true if this segment corresponds to the whole utterance that - # it's a part of (i.e. its start time is zero and its end time is the - # end time of the last line of the utterance). - last_line_of_utt = self.split_lines_of_utt[-1] - last_line_end_time = float(last_line_of_utt[2]) + float(last_line_of_utt[3]) - return abs(self.StartTime() - 0.0) < 0.001 and \ - abs(self.EndTime() - last_line_end_time) < 0.001 - - # Returns the proportion of the duration of this segment that consists of - # unk-padding and tainted lines of input (will be between 0.0 and 1.0). - def JunkProportion(self): - # Note: only the first and last lines could possibly be tainted as - # that's how we create the segments; and if either or both are tainted - # the utterance must contain other lines, so double-counting is not a - # problem. - junk_duration = self.start_unk_padding + self.end_unk_padding - first_split_line = self.split_lines_of_utt[self.start_index] - if IsTainted(first_split_line): - first_duration = float(first_split_line[3]) - junk_duration += first_duration * self.start_keep_proportion - last_split_line = self.split_lines_of_utt[self.end_index - 1] - if IsTainted(last_split_line): - last_duration = float(last_split_line[3]) - junk_duration += last_duration * self.end_keep_proportion - return junk_duration / self.Length() - - # This function will remove something from the beginning of the - # segment if it's possible to cleanly lop off a bit that contains - # more junk, as a proportion of its length, than 'args.max_junk_proportion'. - # Junk is defined as unk-padding and/or tainted segments. - # It considers as a potential split point the first silence - # segment or non-tainted non-scored-word segment in the - # utterance. See also PossiblyTruncateEndForJunkProportion(). - def PossiblyTruncateStartForJunkProportion(self): - begin_junk_duration = self.start_unk_padding - first_split_line = self.split_lines_of_utt[self.start_index] - if IsTainted(first_split_line): - first_duration = float(first_split_line[3]) - begin_junk_duration += first_duration * self.start_keep_proportion - if begin_junk_duration == 0.0: - # nothing to do. - return - - candidate_start_index = None - # the following iterates over all lines internal to the utterance. - for i in range(self.start_index + 1, self.end_index - 1): - this_split_line = self.split_lines_of_utt[i] - this_edit_type = this_split_line[7] - this_ref_word = this_split_line[6] - # We'll consider splitting on silence and on non-scored words. - # (i.e. making the silence or non-scored word the left boundary of - # the new utterance and discarding the piece to the left of that).
- if this_edit_type == 'sil' or \ - (this_edit_type == 'cor' and this_ref_word in non_scored_words): - candidate_start_index = i - candidate_start_time = float(this_split_line[2]) - break # Consider only the first potential truncation. - if candidate_start_index == None: - return # Nothing to do as there is no place to split. - candidate_removed_piece_duration = candidate_start_time - self.StartTime() - if begin_junk_duration / candidate_removed_piece_duration < args.max_junk_proportion: - return # Nothing to do as the candidate piece to remove has too - # little junk. - # OK, remove the piece. - self.start_index = candidate_start_index - self.start_unk_padding = 0.0 - self.start_keep_proportion = 1.0 - self.debug_str += ',truncated-start-for-junk' - - # This is like PossiblyTruncateStartForJunkProportion(), but - # acts on the end of the segment; see comments there. - def PossiblyTruncateEndForJunkProportion(self): - end_junk_duration = self.end_unk_padding - last_split_line = self.split_lines_of_utt[self.end_index - 1] - if IsTainted(last_split_line): - last_duration = float(last_split_line[3]) - end_junk_duration += last_duration * self.end_keep_proportion - if end_junk_duration == 0.0: - # nothing to do. - return - - candidate_end_index = None - # the following iterates over all lines internal to the utterance - # (starting from the end). - for i in reversed(range(self.start_index + 1, self.end_index - 1)): - this_split_line = self.split_lines_of_utt[i] - this_edit_type = this_split_line[7] - this_ref_word = this_split_line[6] - # We'll consider splitting on silence and on non-scored words. - # (i.e. making the silence or non-scored word the right boundary of - # the new utterance and discarding the piece to the right of that). - if this_edit_type == 'sil' or \ - (this_edit_type == 'cor' and this_ref_word in non_scored_words): - candidate_end_index = i + 1 # note: end-indexes are one past the last. - candidate_end_time = float(this_split_line[2]) + float(this_split_line[3]) - break # Consider only the latest potential truncation. - if candidate_end_index == None: - return # Nothing to do as there is no place to split. - candidate_removed_piece_duration = self.EndTime() - candidate_end_time - if end_junk_duration / candidate_removed_piece_duration < args.max_junk_proportion: - return # Nothing to do as the candidate piece to remove has too - # little junk. - # OK, remove the piece. - self.end_index = candidate_end_index - self.end_unk_padding = 0.0 - self.end_keep_proportion = 1.0 - self.debug_str += ',truncated-end-for-junk' - - - # this will return true if there is at least one word in the utterance - # that's a scored word (not a non-scored word) and not an OOV word that's - # realized as unk. This becomes a filter on keeping segments. - def ContainsAtLeastOneScoredNonOovWord(self): - global non_scored_words - for i in range(self.start_index, self.end_index): - this_split_line = self.split_lines_of_utt[i] - this_hyp_word = this_split_line[4] - this_ref_word = this_split_line[6] - this_edit = this_split_line[7] - if this_edit == 'cor' and not this_ref_word in non_scored_words \ - and this_ref_word == this_hyp_word: - return True - return False - - # Returns the text corresponding to this utterance, as a string. 
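# [Editor's aside -- illustrative sketch, not part of the original script.]
# The two truncation methods above lop off a leading or trailing piece only
# when the junk it contains (unk-padding plus tainted audio) is a large
# enough fraction of that piece. The decision in toy form:
def worth_truncating(junk_duration, removed_piece_duration,
                     max_junk_proportion=0.1):
    # truncate only if junk accounts for at least the threshold proportion
    # of the piece we would remove
    return junk_duration / removed_piece_duration >= max_junk_proportion

# 0.3 s of junk in a 1.0 s removable piece exceeds the 0.1 default:
assert worth_truncating(0.3, 1.0)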
- def Text(self): - global oov_symbol - text_array = [] - if self.start_unk_padding != 0.0: - text_array.append(oov_symbol) - for i in range(self.start_index, self.end_index): - this_split_line = self.split_lines_of_utt[i] - this_edit = this_split_line[7] - this_ref_word = this_split_line[6] - if this_ref_word != '<eps>' and this_split_line[-1] != 'do-not-include-in-text': - text_array.append(this_ref_word) - if self.end_unk_padding != 0.0: - text_array.append(oov_symbol) - return ' '.join(text_array) - - -# Here, 'text' will be something that indicates the stage of processing, -# e.g. 'Stage 0: segment cores', 'Stage 1: add tainted lines', etc. -def AccumulateSegmentStats(segment_list, text): - global segment_total_length, num_segments - for segment in segment_list: - num_segments[text] += 1 - segment_total_length[text] += segment.Length() - -def PrintSegmentStats(): - global segment_total_length, num_segments, \ - num_utterances, num_utterances_without_segments, \ - total_length_of_utterances - - print('Number of utterances is %d, of which %.2f%% had no segments after ' - 'all processing; total length of data in original utterances (in seconds) ' - 'was %d' % (num_utterances, - num_utterances_without_segments * 100.0 / num_utterances, - total_length_of_utterances), - file = sys.stderr) - - - keys = sorted(segment_total_length.keys()) - for i in range(len(keys)): - key = keys[i] - if i > 0: - delta_percentage = '[%+.2f%%]' % ((segment_total_length[key] - segment_total_length[keys[i-1]]) - * 100.0 / total_length_of_utterances) - print('At %s, num-segments is %d, total length is %.2f%% of original total %s' % ( - key, num_segments[key], - segment_total_length[key] * 100.0 / total_length_of_utterances, - delta_percentage if i > 0 else ''), - file = sys.stderr) - -# This function creates the segments for an utterance as a list -# of class Segment. -# It returns a 2-tuple (list-of-segments, list-of-deleted-segments) -# where the deleted segments are only useful for diagnostic printing. -# Note: split_lines_of_utt is a list of lists, one per line, each containing the -# sequence of fields. -def GetSegmentsForUtterance(split_lines_of_utt): - global num_utterances, num_utterances_without_segments, total_length_of_utterances - - num_utterances += 1 - - segment_ranges = ComputeSegmentCores(split_lines_of_utt) - - utterance_end_time = float(split_lines_of_utt[-1][2]) + float(split_lines_of_utt[-1][3]) - total_length_of_utterances += utterance_end_time - - segments = [ Segment(split_lines_of_utt, x[0], x[1]) - for x in segment_ranges ] - - AccumulateSegmentStats(segments, 'stage 0 [segment cores]') - for segment in segments: - segment.PossiblyAddTaintedLines() - AccumulateSegmentStats(segments, 'stage 1 [add tainted lines]') - new_segments = [] - for s in segments: - new_segments += s.PossiblySplitSegment() - segments = new_segments - AccumulateSegmentStats(segments, 'stage 2 [split segments]') - for s in segments: - s.PossiblyTruncateBoundaries() - AccumulateSegmentStats(segments, 'stage 3 [truncate boundaries]') - for s in segments: - s.RelaxBoundaryTruncation() - AccumulateSegmentStats(segments, 'stage 4 [relax boundary truncation]') - for s in segments: - s.PossiblyAddUnkPadding() - AccumulateSegmentStats(segments, 'stage 5 [unk-padding]') - - deleted_segments = [] - new_segments = [] - for s in segments: - # the 0.999 allows for roundoff error.
- if (not s.IsWholeUtterance() and s.Length() < 0.999 * args.min_new_segment_length): - s.debug_str += '[deleted-because-of--min-new-segment-length]' - deleted_segments.append(s) - else: - new_segments.append(s) - segments = new_segments - AccumulateSegmentStats(segments, 'stage 6 [remove new segments under --min-new-segment-length]') - - new_segments = [] - for s in segments: - # the 0.999 allows for roundoff error. - if s.Length() < 0.999 * args.min_segment_length: - s.debug_str += '[deleted-because-of--min-segment-length]' - deleted_segments.append(s) - else: - new_segments.append(s) - segments = new_segments - AccumulateSegmentStats(segments, 'stage 7 [remove segments under --min-segment-length]') - - for s in segments: - s.PossiblyTruncateStartForJunkProportion() - AccumulateSegmentStats(segments, 'stage 8 [truncate segment-starts for --max-junk-proportion]') - - for s in segments: - s.PossiblyTruncateEndForJunkProportion() - AccumulateSegmentStats(segments, 'stage 9 [truncate segment-ends for --max-junk-proportion]') - - new_segments = [] - for s in segments: - if s.ContainsAtLeastOneScoredNonOovWord(): - new_segments.append(s) - else: - s.debug_str += '[deleted-because-no-scored-non-oov-words]' - deleted_segments.append(s) - - segments = new_segments - AccumulateSegmentStats(segments, 'stage 10 [remove segments without scored,non-OOV words]') - - new_segments = [] - for s in segments: - j = s.JunkProportion() - if j <= args.max_junk_proportion: - new_segments.append(s) - else: - s.debug_str += '[deleted-because-junk-proportion={0}]'.format(j) - deleted_segments.append(s) - - segments = new_segments - AccumulateSegmentStats(segments, 'stage 11 [remove segments with junk exceeding --max-junk-proportion]') - - new_segments = [] - if len(segments) > 0: - new_segments.append(segments[0]) - for i in range(1, len(segments)): - if new_segments[-1].EndTime() >= segments[i].StartTime(): - new_segments[-1].MergeWithSegment(segments[i]) - else: - new_segments.append(segments[i]) - segments = new_segments - AccumulateSegmentStats(segments, 'stage 12 [merge overlapping or touching segments]') - - for i in range(len(segments) - 1): - if segments[i].EndTime() > segments[i+1].StartTime(): - # this just adds something to --ctm-edits-out output - segments[i+1].debug_str += ",overlaps-previous-segment" - - if len(segments) == 0: - num_utterances_without_segments += 1 - - return (segments, deleted_segments) - -# this prints a number with a certain number of digits after -# the point, while removing trailing zeros. -def FloatToString(f): - num_digits = 6 # we want to print 6 digits after the decimal point - g = f - while abs(g) > 1.0: - g *= 0.1 - num_digits += 1 - format_str = '%.{0}g'.format(num_digits) - return format_str % f - -# Gives time in string form as an exact multiple of the frame-length, e.g. 0.01 -# (after rounding). -def TimeToString(time, frame_length): - n = round(time / frame_length) - assert n >= 0 - # The next function call will remove trailing zeros while printing it, so - # that e.g. 0.01 will be printed as 0.01 and not 0.0099999999999999. It - # seems that doing this in a simple way is not really possible (at least, - # not without assuming that frame_length is of the form 10^-n, which we - # don't really want to do). - return FloatToString(n * frame_length) - -def WriteSegmentsForUtterance(text_output_handle, segments_output_handle, - old_utterance_name, segments): - for n in range(len(segments)): - segment = segments[n] - # split utterances will be named foo-bar-1, foo-bar-2, etc.
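# [Editor's aside -- a quick check of the rounding helpers above; this
# mirrors FloatToString()/TimeToString() as standalone functions, and is
# not part of the original script.]
def float_to_string(f):
    num_digits = 6
    g = f
    while abs(g) > 1.0:
        g *= 0.1
        num_digits += 1
    return ('%.{0}g'.format(num_digits)) % f

def time_to_string(time, frame_length):
    # snap to the nearest whole frame, then print without trailing noise
    return float_to_string(round(time / frame_length) * frame_length)

# 0.014999 s snaps to one 0.01 s frame and prints as '0.01', not as
# '0.0099999999...':
assert time_to_string(0.014999, 0.01) == '0.01'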
- new_utterance_name = old_utterance_name + "-" + str(n + 1) - # print a line to the text output of the form - # <new-utterance-id> <text> - # like: - # foo-bar-1 hello this is dan - print(new_utterance_name, segment.Text(), file = text_output_handle) - # print a line to the segments output of the form - # <new-utterance-id> <old-utterance-id> <start-time> <end-time> - # like: - # foo-bar-1 foo-bar 5.1 7.2 - print(new_utterance_name, old_utterance_name, - TimeToString(segment.StartTime(), args.frame_length), - TimeToString(segment.EndTime(), args.frame_length), - file = segments_output_handle) - - - -# Note, this is destructive of 'segments_for_utterance', but it won't matter. -def PrintDebugInfoForUtterance(ctm_edits_out_handle, - split_lines_of_cur_utterance, - segments_for_utterance, - deleted_segments_for_utterance): - # info_to_print will be a list of 2-tuples (time, 'start-segment-n'|'end-segment-n') - # representing the start or end times of segments. - info_to_print = [] - for n in range(len(segments_for_utterance)): - segment = segments_for_utterance[n] - start_string = 'start-segment-' + str(n+1) + '[' + segment.DebugInfo() + ']' - info_to_print.append( (segment.StartTime(), start_string) ) - end_string = 'end-segment-' + str(n+1) - info_to_print.append( (segment.EndTime(), end_string) ) - # for segments that were deleted we print info like start-deleted-segment-1, and - # otherwise similar info to segments that were retained. - for n in range(len(deleted_segments_for_utterance)): - segment = deleted_segments_for_utterance[n] - start_string = 'start-deleted-segment-' + str(n+1) + '[' + segment.DebugInfo() + ']' - info_to_print.append( (segment.StartTime(), start_string) ) - end_string = 'end-deleted-segment-' + str(n+1) - info_to_print.append( (segment.EndTime(), end_string) ) - - info_to_print = sorted(info_to_print) - - for i in range(len(split_lines_of_cur_utterance)): - split_line = split_lines_of_cur_utterance[i] - split_line[0] += '[' + str(i) + ']' # add an index like [0], [1], to - # the utterance-id so we can easily - # look up segment indexes. - start_time = float(split_line[2]) - end_time = start_time + float(split_line[3]) - split_line_copy = list(split_line) - while len(info_to_print) > 0 and info_to_print[0][0] <= end_time: - (segment_start, string) = info_to_print[0] - # shift the first element off of info_to_print. - info_to_print = info_to_print[1:] - # add a field like 'start-segment1[...]=3.21' to what we're about to print. - split_line_copy.append(string + "=" + TimeToString(segment_start, args.frame_length)) - print(' '.join(split_line_copy), file = ctm_edits_out_handle) - -# This accumulates word-level stats: for each reference word, with what -# probability it will end up in the core of a segment. Words with low -# probabilities of being in segments will generally be associated with some kind -# of error (there is a higher probability of having a wrong lexicon entry).
-def AccWordStatsForUtterance(split_lines_of_utt, - segments_for_utterance): - # word_count_pair is a map from a string (the word) to - # a list [total-count, count-not-within-segments] - global word_count_pair - line_is_in_segment = [ False ] * len(split_lines_of_utt) - for segment in segments_for_utterance: - for i in range(segment.start_index, segment.end_index): - line_is_in_segment[i] = True - for i in range(len(split_lines_of_utt)): - this_ref_word = split_lines_of_utt[i][6] - if this_ref_word != '<eps>': - word_count_pair[this_ref_word][0] += 1 - if not line_is_in_segment[i]: - word_count_pair[this_ref_word][1] += 1 - -def PrintWordStats(word_stats_out): - try: - f = open(word_stats_out, 'w') - except: - sys.exit("segment_ctm_edits.py: error opening word-stats file --word-stats-out={0} " - "for writing".format(word_stats_out)) - global word_count_pair - # Sort from most to least problematic. We want to give more prominence to - # words that are most frequently not in segments, but also to high-count - # words. Define badness = pair[1] / pair[0], and total_count = pair[0], - # where 'pair' is a value of word_count_pair. We'll reverse sort on - # badness^3 * total_count = pair[1]^3 / pair[0]^2. - for key, pair in sorted(word_count_pair.items(), - key = lambda item: (item[1][1] ** 3) * 1.0 / (item[1][0] ** 2), - reverse = True): - badness = pair[1] * 1.0 / pair[0] - total_count = pair[0] - print(key, badness, total_count, file = f) - try: - f.close() - except: - sys.exit("segment_ctm_edits.py: error closing file --word-stats-out={0} " - "(full disk?)".format(word_stats_out)) - print("segment_ctm_edits.py: please see the file {0} for word-level statistics " - "saying how frequently each word was excluded from a segment; format is " - "<word> <bad-proportion> <total-count>. Particularly " - "problematic words appear near the top of the file.".format(word_stats_out), - file = sys.stderr) - - -def ProcessData(): - try: - f_in = open(args.ctm_edits_in) - except: - sys.exit("segment_ctm_edits.py: error opening ctm-edits input " - "file {0}".format(args.ctm_edits_in)) - try: - text_output_handle = open(args.text_out, 'w') - except: - sys.exit("segment_ctm_edits.py: error opening text output " - "file {0}".format(args.text_out)) - try: - segments_output_handle = open(args.segments_out, 'w') - except: - sys.exit("segment_ctm_edits.py: error opening segments output " - "file {0}".format(args.segments_out)) - if args.ctm_edits_out != None: - try: - ctm_edits_output_handle = open(args.ctm_edits_out, 'w') - except: - sys.exit("segment_ctm_edits.py: error opening ctm-edits output " - "file {0}".format(args.ctm_edits_out)) - - # Most of what we're doing in the lines below is splitting the input lines - # and grouping them per utterance, before giving them to GetSegmentsForUtterance() - # and then writing the text, segments and optional debug output.
- first_line = f_in.readline() - if first_line == '': - sys.exit("segment_ctm_edits.py: empty input") - split_pending_line = first_line.split() - if len(split_pending_line) == 0: - sys.exit("segment_ctm_edits.py: bad input line " + first_line) - cur_utterance = split_pending_line[0] - split_lines_of_cur_utterance = [] - - while True: - if len(split_pending_line) == 0 or split_pending_line[0] != cur_utterance: - (segments_for_utterance, - deleted_segments_for_utterance) = GetSegmentsForUtterance(split_lines_of_cur_utterance) - AccWordStatsForUtterance(split_lines_of_cur_utterance, segments_for_utterance) - WriteSegmentsForUtterance(text_output_handle, segments_output_handle, - cur_utterance, segments_for_utterance) - if args.ctm_edits_out != None: - PrintDebugInfoForUtterance(ctm_edits_output_handle, - split_lines_of_cur_utterance, - segments_for_utterance, - deleted_segments_for_utterance) - split_lines_of_cur_utterance = [] - if len(split_pending_line) == 0: - break - else: - cur_utterance = split_pending_line[0] - - split_lines_of_cur_utterance.append(split_pending_line) - next_line = f_in.readline() - split_pending_line = next_line.split() - if len(split_pending_line) == 0: - if next_line != '': - sys.exit("segment_ctm_edits.py: got an empty or whitespace input line") - try: - text_output_handle.close() - segments_output_handle.close() - if args.ctm_edits_out != None: - ctm_edits_output_handle.close() - except: - sys.exit("segment_ctm_edits.py: error closing one or more outputs " - "(broken pipe or full disk?)") - - -def ReadNonScoredWords(non_scored_words_file): - global non_scored_words - try: - f = open(non_scored_words_file) - except: - sys.exit("segment_ctm_edits.py: error opening file: " - "--non-scored-words=" + non_scored_words_file) - for line in f.readlines(): - a = line.split() - if len(a) != 1: - sys.exit("segment_ctm_edits.py: bad line in non-scored-words " - "file {0}: {1}".format(non_scored_words_file, line)) - non_scored_words.add(a[0]) - f.close() - - - - -non_scored_words = set() -ReadNonScoredWords(args.non_scored_words_in) - -oov_symbol = None -if args.oov_symbol_file != None: - try: - with open(args.oov_symbol_file) as f: - line = f.readline() - assert len(line.split()) == 1 - oov_symbol = line.split()[0] - assert f.readline() == '' - except Exception as e: - sys.exit("segment_ctm_edits.py: error reading file --oov-symbol-file=" + - args.oov_symbol_file + ", error is: " + str(e)) -elif args.unk_padding != 0.0: - sys.exit("segment_ctm_edits.py: if the --unk-padding option is nonzero (which " - "it is by default), the --oov-symbol-file option must be supplied.") - -# segment_total_length and num_segments are maps from -# 'stage' strings; see AccumulateSegmentStats for details. -segment_total_length = defaultdict(int) -num_segments = defaultdict(int) -# the lambda expression below is an anonymous function that takes no arguments -# and returns the new list [0, 0]. 
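# [Editor's aside -- illustrative sketch, not part of the original script.]
# How the word_count_pair structure defined just below behaves, together
# with the badness^3 * total_count ranking used by PrintWordStats() above:
from collections import defaultdict

counts = defaultdict(lambda: [0, 0])  # word -> [total, not-in-any-segment]
for word, excluded in [('hello', False), ('hello', True), ('zzz', True)]:
    counts[word][0] += 1
    if excluded:
        counts[word][1] += 1

ranked = sorted(counts.items(),
                key=lambda item: (item[1][1] ** 3) * 1.0 / (item[1][0] ** 2),
                reverse=True)
# 'zzz' (excluded every time it occurs) ranks above 'hello' (excluded half
# the time):
assert ranked[0][0] == 'zzz'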
-word_count_pair = defaultdict(lambda: [0, 0]) -num_utterances = 0 -num_utterances_without_segments = 0 -total_length_of_utterances = 0 - - -ProcessData() -PrintSegmentStats() -if args.word_stats_out != None: - PrintWordStats(args.word_stats_out) -if args.ctm_edits_out != None: - print("segment_ctm_edits.py: detailed utterance-level debug information " - "is in " + args.ctm_edits_out, file = sys.stderr) - diff --git a/egs/wsj/s5/steps/cleanup/taint_ctm_edits.py b/egs/wsj/s5/steps/cleanup/taint_ctm_edits.py deleted file mode 100755 index c763d7191a1..00000000000 --- a/egs/wsj/s5/steps/cleanup/taint_ctm_edits.py +++ /dev/null @@ -1,242 +0,0 @@ -#!/usr/bin/env python - -# Copyright 2016 Vimal Manohar -# 2016 Johns Hopkins University (author: Daniel Povey) -# Apache 2.0 - -from __future__ import print_function -import sys, operator, argparse, os -from collections import defaultdict - -# This script reads and writes the 'ctm-edits' file that is -# produced by get_ctm_edits.py. -# -# It is to be applied after modify_ctm_edits.py. Its function is to add, in -# certain circumstances, an optional extra field with the word 'tainted' to the -# ctm-edits format, e.g. an input line like: -# -# AJJacobs_2007P-0001605-0003029 1 0 0.09 <eps> 1.0 <eps> sil -# might become: -# AJJacobs_2007P-0001605-0003029 1 0 0.09 <eps> 1.0 <eps> sil tainted -# -# It also deletes certain lines, representing deletions, from the ctm (if they -# were next to taintable lines... their presence could then be inferred from the -# 'tainted' flag). -# -# You should interpret the 'tainted' flag as "we're not sure what's going on here; -# don't trust this." -# -# One of the problems this script is trying to solve is that if we have errors -# that are adjacent to silence or non-scored words, -# it's not at all clear whether the silence or non-scored words were really such, -# or might have contained actual words. -# Also, if we have words in the reference that were realized as '<unk>' in the -# hypothesis, and they are adjacent to errors, it's almost always the case -# that the '<unk>' doesn't really correspond to the word in the reference, so -# we mark these as 'tainted'. -# -# The rule for tainting is quite simple; see the code. - - - -parser = argparse.ArgumentParser( - description = "This program modifies the ctm-edits format to identify " - "silence and 'fixed' non-scored-word lines, and lines where the hyp is " - "'<unk>' and the reference is a real but OOV word, where there is a relatively " - "high probability that something is going wrong so we shouldn't trust " - "this line. It adds the field 'tainted' to such " - "lines. Lines in the ctm representing deletions from the reference will " - "be removed if they have 'tainted' adjacent lines (since it won't be clear " - "where such reference words were really realized, if at all). " - "See comments at the top of the script for more information.") - -parser.add_argument("--verbose", type = int, default = 1, - choices=[0,1,2,3], - help = "Verbose level, higher = more verbose output") -parser.add_argument("ctm_edits_in", metavar = "<ctm-edits-in>", - help = "Filename of input ctm-edits file. " - "Use /dev/stdin for standard input.") -parser.add_argument("ctm_edits_out", metavar = "<ctm-edits-out>", - help = "Filename of output ctm-edits file. " - "Use /dev/stdout for standard output.") - -args = parser.parse_args() - - - -# This function is the core of the program, that does the tainting and -# removes some lines representing deletions. -# split_lines_of_utt is a list of lists, one per line, each containing the -# sequence of fields.
Returns the same format of data after processing to add -# the 'tainted' field. Note: this function is destructive of its input; the -# input will not have the same value afterwards. -def ProcessUtterance(split_lines_of_utt): - global num_lines_of_type, num_tainted_lines, \ - num_del_lines_giving_taint, num_sub_lines_giving_taint, \ - num_ins_lines_giving_taint - - # work out whether each line is taintable [i.e. silence or fix or unk replacing - # real-word]. - taintable = [ False ] * len(split_lines_of_utt) - for i in range(len(split_lines_of_utt)): - edit_type = split_lines_of_utt[i][7] - if edit_type == 'sil' or edit_type == 'fix': - taintable[i] = True - elif edit_type == 'cor' and split_lines_of_utt[i][4] != split_lines_of_utt[i][6]: - # this is the case when '<unk>' replaces a real word that was out of - # the vocabulary; we mark it as correct because such words do - # translate to '<unk>' if we don't have a pronunciation. However we - # don't have good confidence that the alignments of such words are - # accurate if they are adjacent to errors. - taintable[i] = True - - - for i in range(len(split_lines_of_utt)): - edit_type = split_lines_of_utt[i][7] - num_lines_of_type[edit_type] += 1 - if edit_type == 'del' or edit_type == 'sub' or edit_type == 'ins': - tainted_an_adjacent_line = False - # First go backwards tainting lines - j = i - 1 - while j >= 0 and taintable[j]: - tainted_an_adjacent_line = True - if len(split_lines_of_utt[j]) == 8: - num_tainted_lines += 1 - split_lines_of_utt[j].append('tainted') - j -= 1 - # Next go forwards tainting lines - j = i + 1 - while j < len(split_lines_of_utt) and taintable[j]: - tainted_an_adjacent_line = True - if len(split_lines_of_utt[j]) == 8: - num_tainted_lines += 1 - split_lines_of_utt[j].append('tainted') - j += 1 - if tainted_an_adjacent_line: - if edit_type == 'del': - split_lines_of_utt[i][7] = 'remove-this-line' - num_del_lines_giving_taint += 1 - elif edit_type == 'sub': - num_sub_lines_giving_taint += 1 - else: - num_ins_lines_giving_taint += 1 - - new_split_lines_of_utt = [] - for i in range(len(split_lines_of_utt)): - if split_lines_of_utt[i][7] != 'remove-this-line': - new_split_lines_of_utt.append(split_lines_of_utt[i]) - return new_split_lines_of_utt - - -def ProcessData(): - try: - f_in = open(args.ctm_edits_in) - except: - sys.exit("taint_ctm_edits.py: error opening ctm-edits input " - "file {0}".format(args.ctm_edits_in)) - try: - f_out = open(args.ctm_edits_out, 'w') - except: - sys.exit("taint_ctm_edits.py: error opening ctm-edits output " - "file {0}".format(args.ctm_edits_out)) - num_lines_processed = 0 - - - # Most of what we're doing in the lines below is splitting the input lines - # and grouping them per utterance, before giving them to ProcessUtterance() - # and then printing the modified lines.
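# [Editor's aside -- a toy version of the tainting rule implemented by
# ProcessUtterance() above, not part of the original script.] Around every
# error line (ins/del/sub), adjacent 'taintable' lines are flagged, scanning
# outward in both directions:
def taint(edit_types, taintable):
    tainted = [False] * len(edit_types)
    for i, edit in enumerate(edit_types):
        if edit in ('ins', 'del', 'sub'):
            for step in (-1, 1):  # scan backwards, then forwards
                j = i + step
                while 0 <= j < len(edit_types) and taintable[j]:
                    tainted[j] = True
                    j += step
    return tainted

# a silence next to a deletion is tainted; one far from any error is not:
assert taint(['cor', 'sil', 'del', 'cor', 'sil'],
             [False, True, False, False, True]) == \
    [False, True, False, False, False]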
- first_line = f_in.readline() - if first_line == '': - sys.exit("taint_ctm_edits.py: empty input") - split_pending_line = first_line.split() - if len(split_pending_line) == 0: - sys.exit("taint_ctm_edits.py: bad input line " + first_line) - cur_utterance = split_pending_line[0] - split_lines_of_cur_utterance = [] - - while True: - if len(split_pending_line) == 0 or split_pending_line[0] != cur_utterance: - split_lines_of_cur_utterance = ProcessUtterance(split_lines_of_cur_utterance) - for split_line in split_lines_of_cur_utterance: - print(' '.join(split_line), file = f_out) - split_lines_of_cur_utterance = [] - if len(split_pending_line) == 0: - break - else: - cur_utterance = split_pending_line[0] - - split_lines_of_cur_utterance.append(split_pending_line) - next_line = f_in.readline() - split_pending_line = next_line.split() - if len(split_pending_line) == 0: - if next_line != '': - sys.exit("taint_ctm_edits.py: got an empty or whitespace input line") - try: - f_out.close() - except: - sys.exit("taint_ctm_edits.py: error closing ctm-edits output " - "(broken pipe or full disk?)") - - -def PrintStats(): - tot_lines = sum(num_lines_of_type.values()) - if args.verbose < 1 or tot_lines == 0: - return - print("taint_ctm_edits.py: processed {0} input lines, whose edit-types were: ".format(tot_lines) + - ', '.join([ '%s = %.2f%%' % (k, num_lines_of_type[k] * 100.0 / tot_lines) - for k in sorted(num_lines_of_type.keys(), reverse = True, - key = lambda k: num_lines_of_type[k]) ]), - file = sys.stderr) - - - del_giving_taint_percent = num_del_lines_giving_taint * 100.0 / tot_lines - sub_giving_taint_percent = num_sub_lines_giving_taint * 100.0 / tot_lines - ins_giving_taint_percent = num_ins_lines_giving_taint * 100.0 / tot_lines - tainted_lines_percent = num_tainted_lines * 100.0 / tot_lines - - print("taint_ctm_edits.py: as a percentage of all lines, (%.2f%%, %.2f%%, %.2f%%) were " - "(deletions, substitutions, insertions) that tainted adjacent lines. %.2f%% of all " - "lines were tainted." % (del_giving_taint_percent, sub_giving_taint_percent, - ins_giving_taint_percent, tainted_lines_percent), - file = sys.stderr) - - - -# num_lines_of_type will map from line-type ('cor', 'sub', etc.) to count.
-num_lines_of_type = defaultdict(int) -num_tainted_lines = 0 -num_del_lines_giving_taint = 0 -num_sub_lines_giving_taint = 0 -num_ins_lines_giving_taint = 0 - -ProcessData() -PrintStats() - diff --git a/egs/wsj/s5/utils/lang/make_phone_lm.py b/egs/wsj/s5/utils/lang/make_phone_lm.py index 28ed7d3426b..47d2a45d229 100755 --- a/egs/wsj/s5/utils/lang/make_phone_lm.py +++ b/egs/wsj/s5/utils/lang/make_phone_lm.py @@ -151,7 +151,7 @@ def AddRawCountsFromLine(self, line): try: words = [self.bos_symbol] + [ int(x) for x in line.split() ] + [self.eos_symbol] except: - sys.exit("make_one_biased_lm.py: bad input line {0} (expected a sequence " + sys.exit("make_phone_lm.py: bad input line {0} (expected a sequence " "of integers)".format(line)) for n in range(1, len(words)): @@ -170,7 +170,7 @@ def AddRawCountsFromStandardInput(self): self.AddRawCountsFromLine(line) lines_processed += 1 if lines_processed == 0 or args.verbose > 0: - print("make_one_biased_lm.py: processed {0} lines of input".format( + print("make_phone_lm.py: processed {0} lines of input".format( lines_processed), file = sys.stderr)
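# [Editor's aside on the make_phone_lm.py hunk above -- a minimal sketch of
# the input handling whose messages it renames; the bos/eos ids and function
# name here are hypothetical, and the real script tracks much more state.]
from collections import defaultdict

def raw_ngram_counts(lines, order=2, bos=-1, eos=-2):
    counts = defaultdict(int)
    for line in lines:
        # each line is a sequence of integer phone ids, bracketed by BOS/EOS
        words = [bos] + [int(x) for x in line.split()] + [eos]
        for n in range(1, len(words)):
            history = tuple(words[max(0, n - order + 1):n])
            counts[(history, words[n])] += 1
    return counts

# raw_ngram_counts(['1 2 2']) counts the bigrams (-1,1), (1,2), (2,2), (2,-2).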