diff --git a/egs/aishell2/s5/local/word_segmentation.py b/egs/aishell2/s5/local/word_segmentation.py index eb7bb648970..4ce55a2003e 100644 --- a/egs/aishell2/s5/local/word_segmentation.py +++ b/egs/aishell2/s5/local/word_segmentation.py @@ -4,6 +4,7 @@ # 2018 Beijing Shell Shell Tech. Co. Ltd. (Author: Hui BU) # Apache 2.0 +from __future__ import print_function import sys import jieba reload(sys) diff --git a/egs/ami/s5/local/sort_bad_utts.py b/egs/ami/s5/local/sort_bad_utts.py index f84fcb12608..baabdc73508 100644 --- a/egs/ami/s5/local/sort_bad_utts.py +++ b/egs/ami/s5/local/sort_bad_utts.py @@ -1,5 +1,6 @@ #!/usr/bin/env python +from __future__ import print_function import sys import argparse import logging @@ -38,10 +39,10 @@ def GetSortedWers(utt_info_file): utt_wer_sorted = sorted(utt_wer, key = lambda k : k[1]) try: import numpy as np - bins = range(0,105,5) + bins = list(range(0,105,5)) bins.append(sys.float_info.max) - hist, bin_edges = np.histogram(map(lambda x: x[1], utt_wer_sorted), + hist, bin_edges = np.histogram([x[1] for x in utt_wer_sorted], bins = bins) num_utts = len(utt_wer) string = '' diff --git a/egs/ami/s5/local/tfrnnlm/run_lstm.sh b/egs/ami/s5/local/tfrnnlm/run_lstm.sh index a298590a31d..d68fadb10f3 100755 --- a/egs/ami/s5/local/tfrnnlm/run_lstm.sh +++ b/egs/ami/s5/local/tfrnnlm/run_lstm.sh @@ -39,7 +39,7 @@ if [ $stage -le 3 ]; then decode_dir=${basedir}/decode_${decode_set} # Lattice rescoring - steps/lmrescore_rnnlm_lat.sh \ + steps/tfrnnlm/lmrescore_rnnlm_lat.sh \ --cmd "$tfrnnlm_cmd --mem 16G" \ --rnnlm-ver tensorflow --weight $weight --max-ngram-order $ngram_order \ data/lang_$LM $dir \ diff --git a/egs/ami/s5/local/tfrnnlm/run_vanilla_rnnlm.sh b/egs/ami/s5/local/tfrnnlm/run_vanilla_rnnlm.sh index 15d237b0e12..7a95f38ba1e 100755 --- a/egs/ami/s5/local/tfrnnlm/run_vanilla_rnnlm.sh +++ b/egs/ami/s5/local/tfrnnlm/run_vanilla_rnnlm.sh @@ -39,7 +39,7 @@ if [ $stage -le 3 ]; then decode_dir=${basedir}/decode_${decode_set} # Lattice rescoring - steps/lmrescore_rnnlm_lat.sh \ + steps/tfrnnlm/lmrescore_rnnlm_lat.sh \ --cmd "$tfrnnlm_cmd --mem 16G" \ --rnnlm-ver tensorflow --weight $weight --max-ngram-order $ngram_order \ data/lang_$LM $dir \ diff --git a/egs/an4/s5/local/data_prep.py b/egs/an4/s5/local/data_prep.py index 24cb9bffb07..9d8083f3b60 100644 --- a/egs/an4/s5/local/data_prep.py +++ b/egs/an4/s5/local/data_prep.py @@ -15,6 +15,7 @@ # See the Apache 2 License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function import os import re import sys diff --git a/egs/an4/s5/local/lexicon_prep.py b/egs/an4/s5/local/lexicon_prep.py index 8d451daf869..3584fa86dfb 100644 --- a/egs/an4/s5/local/lexicon_prep.py +++ b/egs/an4/s5/local/lexicon_prep.py @@ -15,6 +15,7 @@ # See the Apache 2 License for the specific language governing permissions and # limitations under the License. 
+from __future__ import print_function import os import re import sys diff --git a/egs/aspire/s5/local/multi_condition/create_uniform_segments.py b/egs/aspire/s5/local/multi_condition/create_uniform_segments.py index e7baafc028c..010811490ef 100755 --- a/egs/aspire/s5/local/multi_condition/create_uniform_segments.py +++ b/egs/aspire/s5/local/multi_condition/create_uniform_segments.py @@ -4,13 +4,14 @@ # creates a segments file in the provided data directory # into uniform segments with specified window and overlap +from __future__ import division import imp, sys, argparse, os, math, subprocess min_segment_length = 10 # in seconds def segment(total_length, window_length, overlap = 0): increment = window_length - overlap num_windows = int(math.ceil(float(total_length)/increment)) - segments = map(lambda x: (x * increment, min( total_length, (x * increment) + window_length)), range(0, num_windows)) + segments = [(x * increment, min( total_length, (x * increment) + window_length)) for x in range(0, num_windows)] if segments[-1][1] - segments[-1][0] < min_segment_length: segments[-2] = (segments[-2][0], segments[-1][1]) segments.pop() @@ -53,7 +54,7 @@ def prepare_segments_file(kaldi_data_dir, window_length, overlap): parser = argparse.ArgumentParser() parser.add_argument('--window-length', type = float, default = 30.0, help = 'length of the window used to cut the segment') parser.add_argument('--overlap', type = float, default = 5.0, help = 'overlap of neighboring windows') - parser.add_argument('data_dir', type=str, help='directory such as data/train') + parser.add_argument('data_dir', help='directory such as data/train') params = parser.parse_args() diff --git a/egs/aspire/s5/local/multi_condition/fill_missing_recordings.py b/egs/aspire/s5/local/multi_condition/fill_missing_recordings.py index e249e54e5f6..2b4bcddda69 100755 --- a/egs/aspire/s5/local/multi_condition/fill_missing_recordings.py +++ b/egs/aspire/s5/local/multi_condition/fill_missing_recordings.py @@ -38,14 +38,14 @@ def fill_ctm(input_ctm_file, output_ctm_file, recording_names): sys.stderr.write(str(" ".join(sys.argv))) parser = argparse.ArgumentParser(usage) - parser.add_argument('input_ctm_file', type=str, help='ctm file for the recordings') - parser.add_argument('output_ctm_file', type=str, help='ctm file for the recordings') - parser.add_argument('recording_name_file', type=str, help='file with names of the recordings') + parser.add_argument('input_ctm_file', help='ctm file for the recordings') + parser.add_argument('output_ctm_file', help='ctm file for the recordings') + parser.add_argument('recording_name_file', help='file with names of the recordings') params = parser.parse_args() try: - file_names = map(lambda x: x.strip(), open("{0}".format(params.recording_name_file)).readlines()) + file_names = [x.strip() for x in open("{0}".format(params.recording_name_file)).readlines()] except IOError: raise Exception("Expected to find {0}".format(params.recording_name_file)) diff --git a/egs/aspire/s5/local/multi_condition/get_air_file_patterns.py b/egs/aspire/s5/local/multi_condition/get_air_file_patterns.py index cc06f58616a..1f06d3e7c3b 100755 --- a/egs/aspire/s5/local/multi_condition/get_air_file_patterns.py +++ b/egs/aspire/s5/local/multi_condition/get_air_file_patterns.py @@ -3,6 +3,7 @@ # script to generate the file_patterns of the AIR database # see load_air.m file in AIR db to understand the naming convention +from __future__ import print_function import sys, glob, re, os.path air_dir = sys.argv[1] @@ -45,4 +46,4 @@ 
file_patterns.append(file_pattern+" "+output_file_name) file_patterns = list(set(file_patterns)) file_patterns.sort() -print "\n".join(file_patterns) +print("\n".join(file_patterns)) diff --git a/egs/aspire/s5/local/multi_condition/normalize_wavs.py b/egs/aspire/s5/local/multi_condition/normalize_wavs.py index dabf420d9f8..6e67d2113c1 100755 --- a/egs/aspire/s5/local/multi_condition/normalize_wavs.py +++ b/egs/aspire/s5/local/multi_condition/normalize_wavs.py @@ -3,6 +3,8 @@ # normalizes the wave files provided in input file list with a common scaling factor # the common scaling factor is computed to 1/\sqrt(1/(total_samples) * \sum_i{\sum_j x_i(j)^2}) where total_samples is sum of all samples of all wavefiles. If the data is multi-channel then each channel is treated as a seperate wave files +from __future__ import division +from __future__ import print_function import argparse, scipy.io.wavfile, warnings, numpy as np, math def get_normalization_coefficient(file_list, is_rir, additional_scaling): @@ -29,7 +31,7 @@ def get_normalization_coefficient(file_list, is_rir, additional_scaling): assert(rate == sampling_rate) else: sampling_rate = rate - data = data / dtype_max_value + data = data/dtype_max_value if is_rir: # just count the energy of the direct impulse response # this is treated as energy of signal from 0.001 seconds before impulse @@ -55,8 +57,8 @@ def get_normalization_coefficient(file_list, is_rir, additional_scaling): except IOError: warnings.warn("Did not find the file {0}.".format(file)) assert(total_samples > 0) - scaling_coefficient = np.sqrt(total_samples / total_energy) - print "Scaling coefficient is {0}.".format(scaling_coefficient) + scaling_coefficient = np.sqrt(total_samples/total_energy) + print("Scaling coefficient is {0}.".format(scaling_coefficient)) if math.isnan(scaling_coefficient): raise Exception(" Nan encountered while computing scaling coefficient. 
This is mostly due to numerical overflow") return scaling_coefficient diff --git a/egs/aspire/s5/local/multi_condition/read_rir.py b/egs/aspire/s5/local/multi_condition/read_rir.py index a2e1c2052e2..04898bda760 100755 --- a/egs/aspire/s5/local/multi_condition/read_rir.py +++ b/egs/aspire/s5/local/multi_condition/read_rir.py @@ -29,9 +29,9 @@ def usage(): #sys.stderr.write(" ".join(sys.argv)+"\n") parser = argparse.ArgumentParser(usage()) parser.add_argument('--output-sampling-rate', type = int, default = 8000, help = 'sampling rate of the output') - parser.add_argument('type', type = str, default = None, help = 'database type', choices = ['air']) - parser.add_argument('input', type = str, default = None, help = 'directory containing the multi-channel data for a particular recording, or file name or file-regex-pattern') - parser.add_argument('output_filename', type = str, default = None, help = 'output filename (if "-" then output is written to output pipe)') + parser.add_argument('type', default = None, help = 'database type', choices = ['air']) + parser.add_argument('input', default = None, help = 'directory containing the multi-channel data for a particular recording, or file name or file-regex-pattern') + parser.add_argument('output_filename', default = None, help = 'output filename (if "-" then output is written to output pipe)') params = parser.parse_args() if params.output_filename == "-": diff --git a/egs/aspire/s5/local/multi_condition/reverberate_wavs.py b/egs/aspire/s5/local/multi_condition/reverberate_wavs.py index 998a3ed5e74..f43e4a2f894 100755 --- a/egs/aspire/s5/local/multi_condition/reverberate_wavs.py +++ b/egs/aspire/s5/local/multi_condition/reverberate_wavs.py @@ -4,18 +4,20 @@ # script to generate multicondition training data / dev data / test data import argparse, glob, math, os, random, scipy.io.wavfile, sys -class list_cyclic_iterator: +class list_cyclic_iterator(object): def __init__(self, list, random_seed = 0): self.list_index = 0 self.list = list random.seed(random_seed) random.shuffle(self.list) - def next(self): + def __next__(self): item = self.list[self.list_index] self.list_index = (self.list_index + 1) % len(self.list) return item + next = __next__ # for Python 2 + def return_nonempty_lines(lines): new_lines = [] for line in lines: @@ -71,15 +73,15 @@ def return_nonempty_lines(lines): for i in range(len(wav_files)): wav_file = " ".join(wav_files[i].split()[1:]) output_wav_file = wav_out_files[i] - impulse_file = impulses.next() + impulse_file = next(impulses) noise_file = '' snr = '' found_impulse = False if add_noise: - for i in xrange(len(impulse_noise_index)): + for i in range(len(impulse_noise_index)): if impulse_file in impulse_noise_index[i][0]: - noise_file = impulse_noise_index[i][1].next() - snr = snrs.next() + noise_file = next(impulse_noise_index[i][1]) + snr = next(snrs) assert(len(wav_file.strip()) > 0) assert(len(impulse_file.strip()) > 0) assert(len(noise_file.strip()) > 0) diff --git a/egs/babel/s5b/local/lonestar.py b/egs/babel/s5b/local/lonestar.py index e1594e55ada..809f99b22cf 100755 --- a/egs/babel/s5b/local/lonestar.py +++ b/egs/babel/s5b/local/lonestar.py @@ -1,4 +1,5 @@ #!/usr/bin/env python +from __future__ import print_function from pylauncher import * import pylauncher import sys @@ -39,7 +40,7 @@ def KaldiLauncher(lo, **kwargs): logfiles = list() commands = list() - for q in xrange(lo.jobstart, lo.jobend+1): + for q in range(lo.jobstart, lo.jobend+1): s = "bash " + lo.queue_scriptfile + " " + str(q) commands.append(s) @@ -74,7 
+75,7 @@ def KaldiLauncher(lo, **kwargs): time.sleep(delay); lines=tail(10, logfile) - with_status=filter(lambda x:re.search(r'with status (\d+)', x), lines) + with_status=[x for x in lines if re.search(r'with status (\d+)', x)] if len(with_status) == 0: sys.stderr.write("The last line(s) of the log-file " + logfile + " does not seem" @@ -98,7 +99,7 @@ def KaldiLauncher(lo, **kwargs): sys.exit(-1); #Remove service files. Be careful not to remove something that might be needed in problem diagnostics - for i in xrange(len(commands)): + for i in range(len(commands)): out_file=os.path.join(qdir, ce.outstring+str(i)) #First, let's wait on files missing (it might be that those are missing @@ -149,7 +150,7 @@ def KaldiLauncher(lo, **kwargs): #print job.final_report() -class LauncherOpts: +class LauncherOpts(object): def __init__(self): self.sync=0 self.nof_threads = 1 @@ -199,7 +200,7 @@ def CmdLineParser(argv): jobend=int(m.group(2)) argv.pop(0) elif re.match("^.+=.*:.*$", argv[0]): - print >> sys.stderr, "warning: suspicious JOB argument " + argv[0]; + print("warning: suspicious JOB argument " + argv[0], file=sys.stderr); if jobstart > jobend: sys.stderr.write("lonestar.py: JOBSTART("+ str(jobstart) + ") must be lower than JOBEND(" + str(jobend) + ")\n") @@ -238,8 +239,8 @@ def setup_paths_and_vars(opts): cwd = os.getcwd() if opts.varname and (opts.varname not in opts.logfile ) and (opts.jobstart != opts.jobend): - print >>sys.stderr, "lonestar.py: you are trying to run a parallel job" \ - "but you are putting the output into just one log file (" + opts.logfile + ")"; + print("lonestar.py: you are trying to run a parallel job" \ + "but you are putting the output into just one log file (" + opts.logfile + ")", file=sys.stderr); sys.exit(1) if not os.path.isabs(opts.logfile): @@ -261,8 +262,8 @@ def setup_paths_and_vars(opts): taskname=os.path.basename(queue_logfile) taskname = taskname.replace(".log", ""); if taskname == "": - print >> sys.stderr, "lonestar.py: you specified the log file name in such form " \ - "that leads to an empty task name ("+logfile + ")"; + print("lonestar.py: you specified the log file name in such form " \ + "that leads to an empty task name ("+logfile + ")", file=sys.stderr); sys.exit(1) if not os.path.isabs(queue_logfile): diff --git a/egs/babel/s5b/local/resegment/segmentation.py b/egs/babel/s5b/local/resegment/segmentation.py index 7c5c8665a16..aed65a4ca14 100755 --- a/egs/babel/s5b/local/resegment/segmentation.py +++ b/egs/babel/s5b/local/resegment/segmentation.py @@ -3,6 +3,7 @@ # Copyright 2014 Vimal Manohar # Apache 2.0 +from __future__ import division import os, glob, argparse, sys, re, time from argparse import ArgumentParser @@ -19,12 +20,12 @@ def mean(l): if len(l) > 0: - return float(sum(l)) / len(l) + return (float(sum(l))/len(l)) return 0 # Analysis class # Stores statistics like the confusion matrix, length of the segments etc. 
-class Analysis: +class Analysis(object): def __init__(self, file_id, frame_shift, prefix): self.confusion_matrix = [0] * 9 self.type_counts = [ [[] for j in range(0,9)] for i in range(0,3) ] @@ -274,8 +275,8 @@ def read_rttm_file(rttm_file, temp_dir, frame_shift): i = len(this_file) category = splits[6] word = splits[5] - start_time = int(float(splits[3])/frame_shift + 0.5) - duration = int(float(splits[4])/frame_shift + 0.5) + start_time = int((float(splits[3])/frame_shift) + 0.5) + duration = int((float(splits[4])/frame_shift) + 0.5) if i < start_time: this_file.extend(["0"]*(start_time - i)) if type1 == "NON-LEX": @@ -295,7 +296,7 @@ def read_rttm_file(rttm_file, temp_dir, frame_shift): # Stats class to store some basic stats about the number of # times the post-processor goes through particular loops or blocks # of code in the algorithm. This is just for debugging. -class Stats: +class Stats(object): def __init__(self): self.inter_utt_nonspeech = 0 self.merge_nonspeech_segment = 0 @@ -321,7 +322,7 @@ def reset(self): self.noise_only = 0 # Timer class to time functions -class Timer: +class Timer(object): def __enter__(self): self.start = time.clock() return self @@ -332,7 +333,7 @@ def __exit__(self, *args): # The main class for post-processing a file. # This does the segmentation either looking at the file isolated # or by looking at both classes simultaneously -class JointResegmenter: +class JointResegmenter(object): def __init__(self, P, A, f, options, phone_map, stats = None, reference = None): # Pointers to prediction arrays and Initialization @@ -1290,22 +1291,22 @@ def main(): dest='hard_max_segment_length', default=15.0, \ help="Hard maximum on the segment length above which the segment " \ + "will be broken even if in the middle of speech (default: %(default)s)") - parser.add_argument('--first-separator', type=str, \ + parser.add_argument('--first-separator', \ dest='first_separator', default="-", \ help="Separator between recording-id and start-time (default: %(default)s)") - parser.add_argument('--second-separator', type=str, \ + parser.add_argument('--second-separator', \ dest='second_separator', default="-", \ help="Separator between start-time and end-time (default: %(default)s)") - parser.add_argument('--remove-noise-only-segments', type=str, \ + parser.add_argument('--remove-noise-only-segments', \ dest='remove_noise_only_segments', default="true", choices=("true", "false"), \ help="Remove segments that have only noise. 
(default: %(default)s)") parser.add_argument('--min-inter-utt-silence-length', type=float, \ dest='min_inter_utt_silence_length', default=1.0, \ help="Minimum silence that must exist between two separate utterances (default: %(default)s)"); - parser.add_argument('--channel1-file', type=str, \ + parser.add_argument('--channel1-file', \ dest='channel1_file', default="inLine", \ help="String that matches with the channel 1 file (default: %(default)s)") - parser.add_argument('--channel2-file', type=str, \ + parser.add_argument('--channel2-file', \ dest='channel2_file', default="outLine", \ help="String that matches with the channel 2 file (default: %(default)s)") parser.add_argument('--isolated-resegmentation', \ @@ -1388,7 +1389,7 @@ def main(): speech_cap = None if options.speech_cap_length != None: - speech_cap = int( options.speech_cap_length / options.frame_shift ) + speech_cap = int(options.speech_cap_length/options.frame_shift) # End if for f in pred_files: @@ -1454,7 +1455,7 @@ def main(): f2 = f3 # End if - if (len(A1) - len(A2)) > options.max_length_diff / options.frame_shift: + if (len(A1) - len(A2)) > int(options.max_length_diff/options.frame_shift): sys.stderr.write( \ "%s: Warning: Lengths of %s and %s differ by more than %f. " \ % (sys.argv[0], f1,f2, options.max_length_diff) \ diff --git a/egs/babel/s5c/local/lonestar.py b/egs/babel/s5c/local/lonestar.py index e1594e55ada..809f99b22cf 100755 --- a/egs/babel/s5c/local/lonestar.py +++ b/egs/babel/s5c/local/lonestar.py @@ -1,4 +1,5 @@ #!/usr/bin/env python +from __future__ import print_function from pylauncher import * import pylauncher import sys @@ -39,7 +40,7 @@ def KaldiLauncher(lo, **kwargs): logfiles = list() commands = list() - for q in xrange(lo.jobstart, lo.jobend+1): + for q in range(lo.jobstart, lo.jobend+1): s = "bash " + lo.queue_scriptfile + " " + str(q) commands.append(s) @@ -74,7 +75,7 @@ def KaldiLauncher(lo, **kwargs): time.sleep(delay); lines=tail(10, logfile) - with_status=filter(lambda x:re.search(r'with status (\d+)', x), lines) + with_status=[x for x in lines if re.search(r'with status (\d+)', x)] if len(with_status) == 0: sys.stderr.write("The last line(s) of the log-file " + logfile + " does not seem" @@ -98,7 +99,7 @@ def KaldiLauncher(lo, **kwargs): sys.exit(-1); #Remove service files. 
Be careful not to remove something that might be needed in problem diagnostics - for i in xrange(len(commands)): + for i in range(len(commands)): out_file=os.path.join(qdir, ce.outstring+str(i)) #First, let's wait on files missing (it might be that those are missing @@ -149,7 +150,7 @@ def KaldiLauncher(lo, **kwargs): #print job.final_report() -class LauncherOpts: +class LauncherOpts(object): def __init__(self): self.sync=0 self.nof_threads = 1 @@ -199,7 +200,7 @@ def CmdLineParser(argv): jobend=int(m.group(2)) argv.pop(0) elif re.match("^.+=.*:.*$", argv[0]): - print >> sys.stderr, "warning: suspicious JOB argument " + argv[0]; + print("warning: suspicious JOB argument " + argv[0], file=sys.stderr); if jobstart > jobend: sys.stderr.write("lonestar.py: JOBSTART("+ str(jobstart) + ") must be lower than JOBEND(" + str(jobend) + ")\n") @@ -238,8 +239,8 @@ def setup_paths_and_vars(opts): cwd = os.getcwd() if opts.varname and (opts.varname not in opts.logfile ) and (opts.jobstart != opts.jobend): - print >>sys.stderr, "lonestar.py: you are trying to run a parallel job" \ - "but you are putting the output into just one log file (" + opts.logfile + ")"; + print("lonestar.py: you are trying to run a parallel job" \ + "but you are putting the output into just one log file (" + opts.logfile + ")", file=sys.stderr); sys.exit(1) if not os.path.isabs(opts.logfile): @@ -261,8 +262,8 @@ def setup_paths_and_vars(opts): taskname=os.path.basename(queue_logfile) taskname = taskname.replace(".log", ""); if taskname == "": - print >> sys.stderr, "lonestar.py: you specified the log file name in such form " \ - "that leads to an empty task name ("+logfile + ")"; + print("lonestar.py: you specified the log file name in such form " \ + "that leads to an empty task name ("+logfile + ")", file=sys.stderr); sys.exit(1) if not os.path.isabs(queue_logfile): diff --git a/egs/babel/s5c/local/resegment/segmentation.py b/egs/babel/s5c/local/resegment/segmentation.py index 7c5c8665a16..4bdb0fea75c 100755 --- a/egs/babel/s5c/local/resegment/segmentation.py +++ b/egs/babel/s5c/local/resegment/segmentation.py @@ -3,6 +3,7 @@ # Copyright 2014 Vimal Manohar # Apache 2.0 +from __future__ import division import os, glob, argparse, sys, re, time from argparse import ArgumentParser @@ -19,12 +20,12 @@ def mean(l): if len(l) > 0: - return float(sum(l)) / len(l) + return (float(sum(l))/len(l)) return 0 # Analysis class # Stores statistics like the confusion matrix, length of the segments etc. -class Analysis: +class Analysis(object): def __init__(self, file_id, frame_shift, prefix): self.confusion_matrix = [0] * 9 self.type_counts = [ [[] for j in range(0,9)] for i in range(0,3) ] @@ -274,7 +275,7 @@ def read_rttm_file(rttm_file, temp_dir, frame_shift): i = len(this_file) category = splits[6] word = splits[5] - start_time = int(float(splits[3])/frame_shift + 0.5) + start_time = int((float(splits[3])/frame_shift) + 0.5) duration = int(float(splits[4])/frame_shift + 0.5) if i < start_time: this_file.extend(["0"]*(start_time - i)) @@ -295,7 +296,7 @@ def read_rttm_file(rttm_file, temp_dir, frame_shift): # Stats class to store some basic stats about the number of # times the post-processor goes through particular loops or blocks # of code in the algorithm. This is just for debugging. 
-class Stats: +class Stats(object): def __init__(self): self.inter_utt_nonspeech = 0 self.merge_nonspeech_segment = 0 @@ -321,7 +322,7 @@ def reset(self): self.noise_only = 0 # Timer class to time functions -class Timer: +class Timer(object): def __enter__(self): self.start = time.clock() return self @@ -332,7 +333,7 @@ def __exit__(self, *args): # The main class for post-processing a file. # This does the segmentation either looking at the file isolated # or by looking at both classes simultaneously -class JointResegmenter: +class JointResegmenter(object): def __init__(self, P, A, f, options, phone_map, stats = None, reference = None): # Pointers to prediction arrays and Initialization @@ -351,9 +352,9 @@ def __init__(self, P, A, f, options, phone_map, stats = None, reference = None): self.frame_shift = options.frame_shift # Convert length in seconds to frames - self.max_frames = int(options.max_segment_length / options.frame_shift) - self.hard_max_frames = int(options.hard_max_segment_length / options.frame_shift) - self.min_inter_utt_nonspeech_length = int(options.min_inter_utt_silence_length / options.frame_shift) + self.max_frames = int(options.max_segment_length/options.frame_shift) + self.hard_max_frames = int(options.hard_max_segment_length/options.frame_shift) + self.min_inter_utt_nonspeech_length = int(options.min_inter_utt_silence_length/options.frame_shift) if ( options.remove_noise_only_segments == "false" ): self.remove_noise_segments = False elif ( options.remove_noise_only_segments == "true" ): @@ -540,7 +541,7 @@ def set_nonspeech_proportion(self): # Set the number of non-speech frames to be added depending on the # silence proportion. The target number of frames in the segments # is computed as below: - target_segment_frames = int(num_speech_frames / (1.0 - self.options.silence_proportion)) + target_segment_frames = int(num_speech_frames/(1.0 - self.options.silence_proportion)) # The number of frames currently in the segments num_segment_frames = num_speech_frames @@ -599,7 +600,7 @@ def set_nonspeech_proportion(self): if not changed: # avoid an infinite loop. if no changes, then break.
break if num_segment_frames < target_segment_frames: - proportion = float(num_segment_frames - num_speech_frames) / num_segment_frames + proportion = float(num_segment_frames - num_speech_frames)/num_segment_frames sys.stderr.write("%s: Warning: for recording %s, only got a proportion %f of non-speech frames, versus target %f\n" % (sys.argv[0], self.file_id, proportion, self.options.silence_proportion)) ########################################################################### @@ -863,14 +864,14 @@ def split_long_segments(self): # Count the number of times long segments are split self.stats.split_segments += 1 - num_pieces = int((float(segment_length) / self.hard_max_frames) + 0.99999) + num_pieces = int((float(segment_length)/self.hard_max_frames) + 0.99999) sys.stderr.write("%s: Warning: for recording %s, " \ % (sys.argv[0], self.file_id) \ + "splitting segment of length %f seconds into %d pieces " \ % (segment_length * self.frame_shift, num_pieces) \ + "(--hard-max-segment-length %f)\n" \ % self.options.hard_max_segment_length) - frames_per_piece = int(segment_length / num_pieces) + frames_per_piece = int(segment_length/num_pieces) for i in range(1,num_pieces): q = n + i * frames_per_piece self.S[q] = True @@ -1290,22 +1291,22 @@ def main(): dest='hard_max_segment_length', default=15.0, \ help="Hard maximum on the segment length above which the segment " \ + "will be broken even if in the middle of speech (default: %(default)s)") - parser.add_argument('--first-separator', type=str, \ + parser.add_argument('--first-separator', \ dest='first_separator', default="-", \ help="Separator between recording-id and start-time (default: %(default)s)") - parser.add_argument('--second-separator', type=str, \ + parser.add_argument('--second-separator', \ dest='second_separator', default="-", \ help="Separator between start-time and end-time (default: %(default)s)") - parser.add_argument('--remove-noise-only-segments', type=str, \ + parser.add_argument('--remove-noise-only-segments', \ dest='remove_noise_only_segments', default="true", choices=("true", "false"), \ help="Remove segments that have only noise. (default: %(default)s)") parser.add_argument('--min-inter-utt-silence-length', type=float, \ dest='min_inter_utt_silence_length', default=1.0, \ help="Minimum silence that must exist between two separate utterances (default: %(default)s)"); - parser.add_argument('--channel1-file', type=str, \ + parser.add_argument('--channel1-file', \ dest='channel1_file', default="inLine", \ help="String that matches with the channel 1 file (default: %(default)s)") - parser.add_argument('--channel2-file', type=str, \ + parser.add_argument('--channel2-file', \ dest='channel2_file', default="outLine", \ help="String that matches with the channel 2 file (default: %(default)s)") parser.add_argument('--isolated-resegmentation', \ @@ -1388,7 +1389,7 @@ def main(): speech_cap = None if options.speech_cap_length != None: - speech_cap = int( options.speech_cap_length / options.frame_shift ) + speech_cap = int(options.speech_cap_length/options.frame_shift) # End if for f in pred_files: @@ -1454,7 +1455,7 @@ def main(): f2 = f3 # End if - if (len(A1) - len(A2)) > options.max_length_diff / options.frame_shift: + if (len(A1) - len(A2)) > int(options.max_length_diff/options.frame_shift): sys.stderr.write( \ "%s: Warning: Lengths of %s and %s differ by more than %f. 
" \ % (sys.argv[0], f1,f2, options.max_length_diff) \ diff --git a/egs/babel/s5d/local/lexicon/make_unicode_lexicon.py b/egs/babel/s5d/local/lexicon/make_unicode_lexicon.py index 68280762597..91419f6e920 100755 --- a/egs/babel/s5d/local/lexicon/make_unicode_lexicon.py +++ b/egs/babel/s5d/local/lexicon/make_unicode_lexicon.py @@ -106,6 +106,7 @@ # Import Statements from __future__ import print_function +from __future__ import division import codecs import argparse import unicodedata @@ -340,7 +341,7 @@ def encode(unicode_transcription, tag_percentage, log=False): int2graph = {v: k for k, v in graph2int.items()} graph_list_int = [graph2int[g] for g in graph_list] bin_edges = range(0, len(int2graph.keys()) + 1) - graph_counts = np.histogram(graph_list_int, bins=bin_edges)[0] / float(len(graph_list_int)) + graph_counts = np.histogram(graph_list_int, bins=bin_edges)[0]/float(len(graph_list_int)) # Set count threshold to frequency that tags the bottom 10% of graphemes bottom_idx = int(np.floor(tag_percentage * len(graph_counts))) count_thresh = sorted(graph_counts)[bottom_idx] @@ -465,7 +466,7 @@ def encode(unicode_transcription, tag_percentage, log=False): for g_dict in table: g_map = "" map_number = 0 - for g_field, g_val in sorted(g_dict.iteritems()): + for g_field, g_val in sorted(g_dict.items()): if(g_field == ("MAP" + str(map_number))): g_map = g_map + g_val + " " map_number = map_number + 1 @@ -561,7 +562,7 @@ def write_table(table, outfile): # Start writing to output with codecs.open(outfile, "w", "utf-8") as fo: # Get header names - header_names = sorted(set().union(*[d.keys() for d in table])) + header_names = sorted(set().union(*[list(d.keys()) for d in table])) # Write headers for h in header_names[:-1]: fo.write("%s\t" % h) @@ -595,7 +596,7 @@ def write_map(grapheme_map, mapfile): ''' with codecs.open(mapfile, 'w', encoding='utf-8') as f: - for g, g_map in grapheme_map.iteritems(): + for g, g_map in grapheme_map.items(): print(g, g_map, file=f) @@ -613,14 +614,14 @@ def write_lexicon(baseforms, encoded_transcription, outfile, sil_lex=None, with codecs.open(outfile, "w", "utf-8") as f: # First write the non-speech words try: - for w in sil_lex.iterkeys(): + for w in sil_lex.keys(): f.write("%s\t%s\n" % (w, sil_lex[w])) except AttributeError: pass # Then write extra-speech words try: - for w in extra_lex.iterkeys(): + for w in extra_lex.keys(): f.write("%s\t%s\n" % (w, extra_lex[w])) except AttributeError: pass @@ -629,9 +630,9 @@ def write_lexicon(baseforms, encoded_transcription, outfile, sil_lex=None, for idx, w in enumerate(baseforms): # This is really just for BABEL in case is written as a word if(w[0].lower() == ""): - f.write("%s\t\n" % (unicode(w[0]))) + f.write("%s\t\n" % (w[0])) else: - f.write("%s\t%s\n" % (unicode(w[0]), + f.write("%s\t%s\n" % (w[0], encoded_transcription[idx])) if __name__ == "__main__": diff --git a/egs/babel/s5d/local/lexicon/make_word_list.py b/egs/babel/s5d/local/lexicon/make_word_list.py index 9a9e17f6c60..c1473b8ced8 100755 --- a/egs/babel/s5d/local/lexicon/make_word_list.py +++ b/egs/babel/s5d/local/lexicon/make_word_list.py @@ -85,7 +85,7 @@ def main(): # Print the word list with codecs.open(args.word_list, "w", encoding="utf-8") as f: for word, count in words: - f.write("%d %s\n" % (count, unicode(word))) + f.write("%d %s\n" % (count, word)) if args.misprons is not None: with codecs.open(args.misprons, "w", encoding="utf-8") as f: diff --git a/egs/babel/s5d/local/prepare_unicode_lexicon.py b/egs/babel/s5d/local/prepare_unicode_lexicon.py index 
86fa4d60ba1..3b9dc1abd86 100755 --- a/egs/babel/s5d/local/prepare_unicode_lexicon.py +++ b/egs/babel/s5d/local/prepare_unicode_lexicon.py @@ -89,7 +89,7 @@ def extract_phonemes(lexicon): # Read all baseform units into dictionary with {a: [a, a_1, a_2], # b: [b_1, b_3], ...} phonemes_dict = {} - for word, pron in lexicon.iteritems(): + for word, pron in lexicon.items(): for p in pron.split(): try: base = p.split("_",1)[0] @@ -98,11 +98,11 @@ def extract_phonemes(lexicon): phonemes_dict[base] = [p] # Makes sure there are no repeats in the list - phonemes_dict = {k: set(v) for k, v in phonemes_dict.iteritems()} + phonemes_dict = {k: set(v) for k, v in phonemes_dict.items()} # Get all unique phonemes phonemes = [] - for v in phonemes_dict.itervalues(): + for v in phonemes_dict.values(): for p in v: phonemes.append(p) @@ -137,11 +137,11 @@ def write_extra_questions(nonsil_phonemes, nonsil_phonemes_dict, # Write all possible phone_tag combinations that occur in the lexicon for tag in tags: - for p in nonsil_phonemes_dict.iterkeys(): + for p in nonsil_phonemes_dict.keys(): tagged_phoneme = "_".join([p, tag]) if(tagged_phoneme in nonsil_phonemes_dict[p]): fp.write("%s " % tagged_phoneme) - for p in sil_phonemes_dict.iterkeys(): + for p in sil_phonemes_dict.keys(): tagged_phoneme = "_".join([p, tag]) if(tagged_phoneme in sil_phonemes_dict[p]): fp.write("%s " % tagged_phoneme) diff --git a/egs/babel/s5d/local/resegment/segmentation.py b/egs/babel/s5d/local/resegment/segmentation.py index 7c5c8665a16..02fd7646b96 100755 --- a/egs/babel/s5d/local/resegment/segmentation.py +++ b/egs/babel/s5d/local/resegment/segmentation.py @@ -3,6 +3,7 @@ # Copyright 2014 Vimal Manohar # Apache 2.0 +from __future__ import division import os, glob, argparse, sys, re, time from argparse import ArgumentParser @@ -19,12 +20,12 @@ def mean(l): if len(l) > 0: - return float(sum(l)) / len(l) + return float(sum(l))/len(l) return 0 # Analysis class # Stores statistics like the confusion matrix, length of the segments etc. -class Analysis: +class Analysis(object): def __init__(self, file_id, frame_shift, prefix): self.confusion_matrix = [0] * 9 self.type_counts = [ [[] for j in range(0,9)] for i in range(0,3) ] @@ -274,8 +275,8 @@ def read_rttm_file(rttm_file, temp_dir, frame_shift): i = len(this_file) category = splits[6] word = splits[5] - start_time = int(float(splits[3])/frame_shift + 0.5) - duration = int(float(splits[4])/frame_shift + 0.5) + start_time = int((float(splits[3])/frame_shift) + 0.5) + duration = int((float(splits[4])/frame_shift) + 0.5) if i < start_time: this_file.extend(["0"]*(start_time - i)) if type1 == "NON-LEX": @@ -295,7 +296,7 @@ def read_rttm_file(rttm_file, temp_dir, frame_shift): # Stats class to store some basic stats about the number of # times the post-processor goes through particular loops or blocks # of code in the algorithm. This is just for debugging. -class Stats: +class Stats(object): def __init__(self): self.inter_utt_nonspeech = 0 self.merge_nonspeech_segment = 0 @@ -321,7 +322,7 @@ def reset(self): self.noise_only = 0 # Timer class to time functions -class Timer: +class Timer(object): def __enter__(self): self.start = time.clock() return self @@ -332,7 +333,7 @@ def __exit__(self, *args): # The main class for post-processing a file. 
# This does the segmentation either looking at the file isolated # or by looking at both classes simultaneously -class JointResegmenter: +class JointResegmenter(object): def __init__(self, P, A, f, options, phone_map, stats = None, reference = None): # Pointers to prediction arrays and Initialization @@ -351,8 +352,8 @@ def __init__(self, P, A, f, options, phone_map, stats = None, reference = None): self.frame_shift = options.frame_shift # Convert length in seconds to frames - self.max_frames = int(options.max_segment_length / options.frame_shift) - self.hard_max_frames = int(options.hard_max_segment_length / options.frame_shift) + self.max_frames = int(options.max_segment_length/options.frame_shift) + self.hard_max_frames = int(options.hard_max_segment_length/options.frame_shift) self.min_inter_utt_nonspeech_length = int(options.min_inter_utt_silence_length / options.frame_shift) if ( options.remove_noise_only_segments == "false" ): self.remove_noise_segments = False @@ -540,7 +541,7 @@ def set_nonspeech_proportion(self): # Set the number of non-speech frames to be added depending on the # silence proportion. The target number of frames in the segments # is computed as below: - target_segment_frames = int(num_speech_frames / (1.0 - self.options.silence_proportion)) + target_segment_frames = int(num_speech_frames/(1.0 - self.options.silence_proportion)) # The number of frames currently in the segments num_segment_frames = num_speech_frames @@ -599,7 +600,7 @@ def set_nonspeech_proportion(self): if not changed: # avoid an infinite loop. if no changes, then break. break if num_segment_frames < target_segment_frames: - proportion = float(num_segment_frames - num_speech_frames) / num_segment_frames + proportion = float(num_segment_frames - num_speech_frames)/ num_segment_frames sys.stderr.write("%s: Warning: for recording %s, only got a proportion %f of non-speech frames, versus target %f\n" % (sys.argv[0], self.file_id, proportion, self.options.silence_proportion)) ########################################################################### @@ -863,14 +864,14 @@ def split_long_segments(self): # Count the number of times long segments are split self.stats.split_segments += 1 - num_pieces = int((float(segment_length) / self.hard_max_frames) + 0.99999) + num_pieces = int((float(segment_length)/self.hard_max_frames) + 0.99999) sys.stderr.write("%s: Warning: for recording %s, " \ % (sys.argv[0], self.file_id) \ + "splitting segment of length %f seconds into %d pieces " \ % (segment_length * self.frame_shift, num_pieces) \ + "(--hard-max-segment-length %f)\n" \ % self.options.hard_max_segment_length) - frames_per_piece = int(segment_length / num_pieces) + frames_per_piece = int(segment_length/num_pieces) for i in range(1,num_pieces): q = n + i * frames_per_piece self.S[q] = True @@ -1388,7 +1389,7 @@ def main(): speech_cap = None if options.speech_cap_length != None: - speech_cap = int( options.speech_cap_length / options.frame_shift ) + speech_cap = int(options.speech_cap_length/options.frame_shift) # End if for f in pred_files: @@ -1454,7 +1455,7 @@ def main(): f2 = f3 # End if - if (len(A1) - len(A2)) > options.max_length_diff / options.frame_shift: + if (len(A1) - len(A2)) > options.max_length_diff/options.frame_shift: sys.stderr.write( \ "%s: Warning: Lengths of %s and %s differ by more than %f. 
" \ % (sys.argv[0], f1,f2, options.max_length_diff) \ diff --git a/egs/bentham/v1/local/gen_topo.py b/egs/bentham/v1/local/gen_topo.py index 540bfbcf270..af9e20317d8 100755 --- a/egs/bentham/v1/local/gen_topo.py +++ b/egs/bentham/v1/local/gen_topo.py @@ -9,6 +9,7 @@ # the number of states for other characters. from __future__ import print_function +from __future__ import division import argparse import string @@ -19,11 +20,11 @@ parser.add_argument("num_nonsil_states", type=int, help="number of states for nonsilence phones"); parser.add_argument("num_sil_states", type=int, help="number of states for silence phones"); parser.add_argument("num_punctuation_states", type=int, help="number of states for punctuation"); -parser.add_argument("nonsilence_phones", type=str, +parser.add_argument("nonsilence_phones", help="List of non-silence phones as integers, separated by colons, e.g. 4:5:6:7:8:9"); -parser.add_argument("silence_phones", type=str, +parser.add_argument("silence_phones", help="List of silence phones as integers, separated by colons, e.g. 1:2:3"); -parser.add_argument("phone_list", type=str, help="file containing all phones and their corresponding number."); +parser.add_argument("phone_list", help="file containing all phones and their corresponding number."); args = parser.parse_args() @@ -47,8 +48,8 @@ print("") for x in range(0, args.num_nonsil_states): xp1 = x + 1 - print(" " + str(x) + " " + str(x) + " " + str(x) + " 0.75 " + str(xp1) + " 0.25 ") -print(" " + str(args.num_nonsil_states) + " ") + print(" {0} {0} {0} 0.75 {1} 0.25 ".format(x, xp1)) +print(" {} ".format(args.num_nonsil_states)) print("") # For nonsilence phones that ar punctuations @@ -58,8 +59,8 @@ print("") for x in range(0, args.num_punctuation_states): xp1 = x + 1 - print(" " + str(x) + " " + str(x) + " " + str(x) + " 0.75 " + str(xp1) + " 0.25 ") -print(" " + str(args.num_punctuation_states) + " ") + print(" {0} {0} {0} 0.75 {1} 0.25 ".format(x, xp1)) +print(" {} ".format(args.num_punctuation_states)) print("") # For silence phones @@ -68,25 +69,25 @@ print(" ".join([str(x) for x in silence_phones])) print("") if(args.num_sil_states > 1): - transp = 1.0 / (args.num_sil_states - 1) + transp = 1.0/(args.num_sil_states - 1) state_str = " 0 0 " for x in range(0, (args.num_sil_states - 1)): - state_str = state_str + " " + str(x) + " " + str(transp) + " " + state_str = "{} {} {} ".format(state_str, x, transp) state_str = state_str + "" print(state_str) for x in range(1, (args.num_sil_states - 1)): - state_str = " " + str(x) + " " + str(x) + " " + state_str = " {0} {0} ".format(x) for y in range(1, args.num_sil_states): - state_str = state_str + " " + str(y) + " " + str(transp) + " " + state_str = "{} {} {} ".format(state_str, y, transp) state_str = state_str + "" print(state_str) second_last = args.num_sil_states - 1 - print(" " + str(second_last) + " " + str(second_last) + " " + str(second_last) + " 0.75 " + str(args.num_sil_states) + " 0.25 ") - print(" " + str(args.num_sil_states) + " ") + print(" {0} {0} {0} 0.75 {1} 0.25 ".format(second_last, args.num_sil_states)) + print(" {} ".format(args.num_sil_states)) else: print(" 0 0 0 0.75 1 0.25 ") - print(" " + str(args.num_sil_states) + " ") + print(" {} ".format(args.num_sil_states)) print("") print("") diff --git a/egs/bn_music_speech/v1/local/make_annotations_bn.py b/egs/bn_music_speech/v1/local/make_annotations_bn.py index 53cebf52ea4..86bec7b16ae 100755 --- a/egs/bn_music_speech/v1/local/make_annotations_bn.py +++ 
b/egs/bn_music_speech/v1/local/make_annotations_bn.py @@ -9,6 +9,7 @@ # # This file is meant to be invoked by make_bn.sh. +from __future__ import print_function import sys, re, os def is_speech(line): @@ -37,7 +38,7 @@ def extract_speech(line): m = re.search('(?<=E_time=)\d+.\d+', line) end = float(m.group(0)) if start > end: - print "Skipping annotation where end time is before start time:", line + print("Skipping annotation where end time is before start time: {}".format(line)) return start, end def extract_other_type2(line): @@ -46,7 +47,7 @@ def extract_other_type2(line): m = re.search('(?<=E_time=)\d+.\d+', line) end = float(m.group(0)) if start > end: - print "Skipping annotation where end time is before start time:", line + print("Skipping annotation where end time is before start time: {}".format(line)) return start, end def extract_music(line): @@ -60,7 +61,7 @@ def extract_music(line): elif level == "O": is_on = False else: - print "Encountered bad token on line:", line + print("Encountered bad token on line: {}".format(line)) sys.exit() return time, is_on @@ -75,7 +76,7 @@ def extract_other_type1(line): elif level == "O": is_on = False else: - print "Encountered bad token on line:", line + print("Encountered bad token on line: {}".format(line)) sys.exit() return time, is_on @@ -92,11 +93,11 @@ def process_file(annos): for line in annos: if is_speech(line): speech_start, speech_end = extract_speech(line) - speech = speech + str(speech_start) + " " + str(speech_end) + "\n" + speech = "{}{} {}\n".format(speech, speech_start, speech_end) max_time = max(speech_end, max_time) elif is_other_type2(line): other_type2_start, other_type2_end = extract_other_type2(line) - other_type2 = other_type2 + str(other_type2_start) + " " + str(other_type2_end) + "\n" + other_type2 = "{}{} {}\n".format(other_type2, other_type2_start, other_type2_end) max_time = max(other_type2_end, max_time) elif is_music(line): time, is_on = extract_music(line) @@ -105,7 +106,7 @@ def process_file(annos): prev_music_time = time start_new_music_segment = False elif not is_on and not start_new_music_segment: - music = music + str(prev_music_time) + " " + str(time) + "\n" + music = "{}{} {}\n".format(music, prev_music_time, time) start_new_music_segment = True elif is_other_type1(line): time, is_on = extract_other_type1(line) @@ -114,13 +115,13 @@ def process_file(annos): prev_other_time = time start_new_other_segment = False elif not is_on and not start_new_other_segment: - other_type1 = other_type1 + str(prev_other_time) + " " + str(time) + "\n" + other_type1 = "{}{} {}\n".format(other_type1, prev_other_time, time) start_new_other_segment = True if not start_new_music_segment: - music = music + str(prev_music_time) + " " + str(max_time) + "\n" + music = "{}{} {}\n".format(music, prev_music_time, max_time) if not start_new_other_segment: - other_type1 = other_type1 + str(prev_other_time) + " " + str(max_time) + "\n" + other_type1 = "{}{} {}\n".format(other_type1, prev_other_time, max_time) other = other_type1 + other_type2 return speech, music, other diff --git a/egs/bn_music_speech/v1/local/make_bn.py b/egs/bn_music_speech/v1/local/make_bn.py index 98836d32534..7ec9aabcbdf 100755 --- a/egs/bn_music_speech/v1/local/make_bn.py +++ b/egs/bn_music_speech/v1/local/make_bn.py @@ -20,7 +20,7 @@ for file in files: utt = str(file).replace(".sph", "") if file.endswith(".sph") and utt in utts: - wav = wav + utt + " sox " + subdir + "/" + utt + ".sph" + " -c 1 -r 16000 -t wav - |\n" + wav = "{0}{1} sox {2}/{1}.sph -c 1 -r 16000 
-t wav - |\n".format(wav, utt, subdir) wav_fi = open(os.path.join(out_dir, "wav.scp"), 'w') wav_fi.write(wav) @@ -32,14 +32,14 @@ count = 1 for line in music_fi: left, right = line.rstrip().split(" ") - segments = segments + utt + "-music-" + str(count) + " " + utt + " " + left + " " + right + "\n" - utt2spk = utt2spk + utt + "-music-" + str(count) + " " + utt + "-music-" + str(count) + "\n" + segments = "{0}{1}-music-{2} {1} {3} {4}\n".format(segments, utt, count, left, right) + utt2spk = "{0}{1}-music-{2} {1}-music-{2}\n".format(utt2spk, utt, count) count += 1 count = 1 for line in speech_fi: left, right = line.rstrip().split(" ") - segments = segments + utt + "-speech-" + str(count) + " " + utt + " " + left + " " + right + "\n" - utt2spk = utt2spk + utt + "-speech-" + str(count) + " " + utt + "-speech-" + str(count) + "\n" + segments = "{0}{1}-speech-{2} {1} {3} {4}\n".format(segments, utt, count, left, right) + utt2spk = "{0}{1}-speech-{2} {1}-speech-{2}\n".format(utt2spk, utt, count) count += 1 utt2spk_fi = open(os.path.join(out_dir, "utt2spk"), 'w') utt2spk_fi.write(utt2spk) diff --git a/egs/bn_music_speech/v1/local/make_musan.py b/egs/bn_music_speech/v1/local/make_musan.py index b3795fe2b7d..942973cfc65 100755 --- a/egs/bn_music_speech/v1/local/make_musan.py +++ b/egs/bn_music_speech/v1/local/make_musan.py @@ -43,9 +43,9 @@ def prepare_music(root_dir, use_vocals): utt2wav_str = utt2wav_str + utt + " " + utt2wav[utt] + "\n" num_good_files += 1 else: - print("Missing file", utt) + print("Missing file {}".format(utt)) num_bad_files += 1 - print("In music directory, processed", num_good_files, "files;", num_bad_files, "had missing wav data") + print("In music directory, processed {} files: {} had missing wav data".format(num_good_files, num_bad_files)) return utt2spk_str, utt2wav_str def prepare_speech(root_dir): @@ -69,9 +69,9 @@ def prepare_speech(root_dir): utt2wav_str = utt2wav_str + utt + " " + utt2wav[utt] + "\n" num_good_files += 1 else: - print("Missing file", utt) + print("Missing file {}".format(utt)) num_bad_files += 1 - print("In speech directory, processed", num_good_files, "files;", num_bad_files, "had missing wav data") + print("In speech directory, processed {} files: {} had missing wav data".format(num_good_files, num_bad_files)) return utt2spk_str, utt2wav_str def prepare_noise(root_dir): @@ -95,9 +95,9 @@ def prepare_noise(root_dir): utt2wav_str = utt2wav_str + utt + " " + utt2wav[utt] + "\n" num_good_files += 1 else: - print("Missing file", utt) + print("Missing file {}".format(utt)) num_bad_files += 1 - print("In noise directory, processed", num_good_files, "files;", num_bad_files, "had missing wav data") + print("In noise directory, processed {} files: {} had missing wav data".format(num_good_files, num_bad_files)) return utt2spk_str, utt2wav_str def main(): diff --git a/egs/bn_music_speech/v1/local/print_scores.py b/egs/bn_music_speech/v1/local/print_scores.py index c2b587cdcad..e563afb63d7 100755 --- a/egs/bn_music_speech/v1/local/print_scores.py +++ b/egs/bn_music_speech/v1/local/print_scores.py @@ -11,6 +11,7 @@ # those strings to determine if it is a target or nontarget # utterance. We arbitrarily pick music to be the target class.
+from __future__ import print_function import sys utt2score = open(sys.argv[1], 'r').readlines() for i in range(0, len(utt2score)): @@ -19,4 +20,4 @@ type = "target" else: type = "nontarget" - print score, type + print(score, type) diff --git a/egs/bn_music_speech/v1/local/refine_annotations_bn.py b/egs/bn_music_speech/v1/local/refine_annotations_bn.py index 52ac87c8640..31cb1803f57 100755 --- a/egs/bn_music_speech/v1/local/refine_annotations_bn.py +++ b/egs/bn_music_speech/v1/local/refine_annotations_bn.py @@ -10,6 +10,7 @@ # designated length are created. # # This file is meant to be invoked from make_bn.sh. +from __future__ import division import sys, os def seg_to_string(seg): @@ -23,7 +24,7 @@ def seg_to_string(seg): def process_segs(raw_segs): segs = [] for seg in raw_segs: - lower, upper = map(float, seg.rstrip().split(" ")) + lower, upper = [float(i) for i in seg.rstrip().split(" ")] segs.append((lower, upper)) return segs @@ -60,8 +61,8 @@ def resegment(music, speech, other, frame_length, min_seg): start_frame = 0 for i in range(1, len(frame2classes)): if curr_class != frame2classes[i]: - start = float(start_frame) / frame_length - end = float(i) / frame_length + start = float(start_frame)/frame_length + end = float(i)/frame_length if end - start > min_seg: if curr_class == "music": new_music.append((start, end)) diff --git a/egs/callhome_diarization/v1/diarization/extract_ivectors.sh b/egs/callhome_diarization/v1/diarization/extract_ivectors.sh index 882b5800908..d7bb389bad5 100755 --- a/egs/callhome_diarization/v1/diarization/extract_ivectors.sh +++ b/egs/callhome_diarization/v1/diarization/extract_ivectors.sh @@ -92,7 +92,7 @@ if [ $stage -le 0 ]; then fi utils/data/get_uniform_subsegments.py \ --max-segment-duration=$window \ - --overlap-duration=$(echo "$window-$period" | bc) \ + --overlap-duration=$(perl -e "print $window-$period") \ --max-remaining-duration=$min_segment \ --constant-duration=True \ $segments > $dir/subsegments diff --git a/egs/callhome_diarization/v1/local/make_musan.py b/egs/callhome_diarization/v1/local/make_musan.py index b3f6652ba40..974e73e0777 100755 --- a/egs/callhome_diarization/v1/local/make_musan.py +++ b/egs/callhome_diarization/v1/local/make_musan.py @@ -43,9 +43,9 @@ def prepare_music(root_dir, use_vocals): utt2wav_str = utt2wav_str + utt + " sox -t wav " + utt2wav[utt] + " -r 8k -t wav - |\n" num_good_files += 1 else: - print("Missing file", utt) + print("Missing file: {}".format(utt)) num_bad_files += 1 - print("In music directory, processed", num_good_files, "files;", num_bad_files, "had missing wav data") + print("In music directory, processed {} files: {} had missing wav data".format(num_good_files, num_bad_files)) return utt2spk_str, utt2wav_str def prepare_speech(root_dir): @@ -69,9 +69,9 @@ def prepare_speech(root_dir): utt2wav_str = utt2wav_str + utt + " sox -t wav " + utt2wav[utt] + " -r 8k -t wav - |\n" num_good_files += 1 else: - print("Missing file", utt) + print("Missing file: {}".format(utt)) num_bad_files += 1 - print("In speech directory, processed", num_good_files, "files;", num_bad_files, "had missing wav data") + print("In speech directory, processed {} files: {} had missing wav data".format(num_good_files, num_bad_files)) return utt2spk_str, utt2wav_str def prepare_noise(root_dir): @@ -95,9 +95,9 @@ def prepare_noise(root_dir): utt2wav_str = utt2wav_str + utt + " sox -t wav " + utt2wav[utt] + " -r 8k -t wav - |\n" num_good_files += 1 else: - print("Missing file", utt) + print("Missing file: {}".format(utt)) num_bad_files += 1
- print("In noise directory, processed", num_good_files, "files;", num_bad_files, "had missing wav data") + print("In noise directory, processed {} files: {} had missing wav data".format(num_good_files, num_bad_files)) return utt2spk_str, utt2wav_str def main(): diff --git a/egs/callhome_egyptian/s5/local/convert_symtable_to_utf.py b/egs/callhome_egyptian/s5/local/convert_symtable_to_utf.py index f5b69a1ff86..7192ff7a1cc 100644 --- a/egs/callhome_egyptian/s5/local/convert_symtable_to_utf.py +++ b/egs/callhome_egyptian/s5/local/convert_symtable_to_utf.py @@ -1,3 +1,4 @@ #!/usr/bin/env py +from __future__ import print_function # Converts a romanized ECA word list (symbol table) to @@ -7,9 +8,9 @@ import codecs if len(sys.argv) < 3: - print "USAGE: local/convert_symtable_to_utf.py [SYMTABLE] [ECA-LEXICON]" - print "E.g., local/convert_symtable_to_utf.py data/lang/words.txt \ - /export/corpora/LDC/LDC99L22" + print("USAGE: local/convert_symtable_to_utf.py [SYMTABLE] [ECA-LEXICON]") + print("E.g., local/convert_symtable_to_utf.py data/lang/words.txt \ + /export/corpora/LDC/LDC99L22") sys.exit(1) # Note that the ECA lexicon's default encoding is ISO-8859-6, not UTF8 diff --git a/egs/callhome_egyptian/s5/local/splits/get_conversation.py b/egs/callhome_egyptian/s5/local/splits/get_conversation.py index c999d3e597e..80f66174e2b 100755 --- a/egs/callhome_egyptian/s5/local/splits/get_conversation.py +++ b/egs/callhome_egyptian/s5/local/splits/get_conversation.py @@ -1,5 +1,6 @@ #!/usr/bin/env python +from __future__ import print_function import os import re @@ -37,14 +38,14 @@ evaltest[pathComponents[12]] = numberOfConversations testConv = testConv + numberOfConversations -print "==============Train===============" -print train -print "Total Conversations in train = " + str(trainConv) -print "==============Dev===============" -print devtest -print "Total Conversations in dev = " + str(devConv) -print "==============Test===============" -print evaltest -print "Total Conversations in test = " + str(testConv) -print "=================================" -print "Total Conversations in Corpus = " + str(trainConv + devConv + testConv) +print("==============Train===============") +print(train) +print("Total Conversations in train = {}".format(trainConv)) +print("==============Dev===============") +print(devtest) +print("Total Conversations in dev = {}".format(devConv)) +print("==============Test===============") +print(evaltest) +print("Total Conversations in test = {}".format(testConv)) +print("=================================") +print("Total Conversations in Corpus = {}".format(trainConv + devConv + testConv)) diff --git a/egs/chime5/s5/local/json2text.py b/egs/chime5/s5/local/json2text.py index 4df0160efb6..a0142ad916e 100755 --- a/egs/chime5/s5/local/json2text.py +++ b/egs/chime5/s5/local/json2text.py @@ -25,8 +25,8 @@ def hms_to_seconds(hms): if __name__ == '__main__': parser = argparse.ArgumentParser() - parser.add_argument('json', type=str, help='JSON transcription file') - parser.add_argument('--mictype', type=str, + parser.add_argument('json', help='JSON transcription file') + parser.add_argument('--mictype', choices=['ref', 'worn', 'u01', 'u02', 'u03', 'u04', 'u05', 'u06'], help='Type of microphones') args = parser.parse_args() diff --git a/egs/cifar/v1/image/get_allowed_lengths.py b/egs/cifar/v1/image/get_allowed_lengths.py index 44e17028695..33996c8eef1 100755 --- a/egs/cifar/v1/image/get_allowed_lengths.py +++ b/egs/cifar/v1/image/get_allowed_lengths.py @@ -10,6 +10,7 @@ file is later used by
make_features.py to pad each image sufficiently so that they all have an allowed length. This is intended for end2end chain training. """ +from __future__ import division import argparse import os @@ -124,7 +125,7 @@ def find_allowed_durations(start_len, end_len, args): def main(): args = get_args() - args.factor = 1.0 + args.factor / 100.0 + args.factor = 1.0 + args.factor/100.0 image2length = read_kaldi_mapfile(os.path.join(args.srcdir, 'image2num_frames')) @@ -133,7 +134,7 @@ def main(): "Coverage rate: {}%".format(start_dur, end_dur, 100.0 - args.coverage_factor * 2)) logger.info("There will be {} unique allowed lengths " - "for the images.".format(int(math.log(end_dur / start_dur) / + "for the images.".format(int((math.log(float(end_dur)/start_dur))/ math.log(args.factor)))) allowed_durations = find_allowed_durations(start_dur, end_dur, args) diff --git a/egs/cifar/v1/image/matrix_to_image.py b/egs/cifar/v1/image/matrix_to_image.py index 52dcead7479..908b1f8b3ed 100755 --- a/egs/cifar/v1/image/matrix_to_image.py +++ b/egs/cifar/v1/image/matrix_to_image.py @@ -26,6 +26,7 @@ copy-feats --binary=false $(grep $imgid data/train/feats.scp | cut -d' ' -f2) - | \ image/matrix_to_image.py --color=1 > $imgid.bmp """ +from __future__ import division import argparse import sys @@ -59,7 +60,7 @@ num_cols = len(line) # initialize if len(line) != num_cols: raise Exception("All rows should be of the same length") - line = map(float, line) # string to float + line = [float(i) for i in line] # string to float if max(line) > 1: raise Excetion("Element value in the matrix should be normalized and no larger than 1") line = [int(x * 255) for x in line] # float to integer ranging from 0 to 255 @@ -70,7 +71,7 @@ if num_cols % 3 != 0: raise Exception("Number of columns should be a multiple of 3 in the color mode") width = num_rows - height = num_cols / 3 + height = num_cols // 3 # reform the image matrix image_array = [[0 for i in range(width * 3)] for j in range(height)] for i in range(height): diff --git a/egs/cifar/v1/image/select_image_in_egs.py b/egs/cifar/v1/image/select_image_in_egs.py index 88d7d568e66..dbf48e6403d 100755 --- a/egs/cifar/v1/image/select_image_in_egs.py +++ b/egs/cifar/v1/image/select_image_in_egs.py @@ -9,6 +9,7 @@ # --vertical-shift=0.3 --srand=27 --num-channels=3 ark:exp/cifar10_egs/egs.1.ark ark,t:- | \ # image/select_image_in_egs.py $id | image/matrix_to_image.py --color 3 > $id.bmp +from __future__ import print_function import argparse import sys diff --git a/egs/cifar/v1/local/process_data.py b/egs/cifar/v1/local/process_data.py index 51173dafc6f..38a599297d2 100755 --- a/egs/cifar/v1/local/process_data.py +++ b/egs/cifar/v1/local/process_data.py @@ -6,6 +6,7 @@ """ This script prepares the training and test data for CIFAR-10 or CIFAR-100.
""" +from __future__ import division import argparse import os @@ -14,13 +15,13 @@ parser = argparse.ArgumentParser(description="""Converts train/test data of CIFAR-10 or CIFAR-100 to Kaldi feature format""") -parser.add_argument('database', type=str, +parser.add_argument('database', default='data/dl/cifar-10-batches-bin', help='path to downloaded cifar data (binary version)') -parser.add_argument('dir', type=str, help='output dir') -parser.add_argument('--cifar-version', type=str, default='CIFAR-10', choices=['CIFAR-10', 'CIFAR-100']) -parser.add_argument('--dataset', type=str, default='train', choices=['train', 'test']) -parser.add_argument('--out-ark', type=str, default='-', help='where to write output feature data') +parser.add_argument('dir', help='output dir') +parser.add_argument('--cifar-version', default='CIFAR-10', choices=['CIFAR-10', 'CIFAR-100']) +parser.add_argument('--dataset', default='train', choices=['train', 'test']) +parser.add_argument('--out-ark', default='-', help='where to write output feature data') args = parser.parse_args() @@ -37,7 +38,7 @@ def load_cifar10_data_batch(datafile): for i in range(num_images_in_batch): label = ord(fh.read(1)) bin_img = fh.read(C * H * W) - img = [[[ord(byte) / 255.0 for byte in bin_img[channel*H*W+row*W:channel*H*W+(row+1)*W]] + img = [[[ord(byte)/255.0 for byte in bin_img[channel*H*W+row*W:channel*H*W+(row+1)*W]] for row in range(H)] for channel in range(C)] labels += [label] data += [img] @@ -52,7 +53,7 @@ def load_cifar100_data_batch(datafile, num_images_in_batch): coarse_label = ord(fh.read(1)) fine_label = ord(fh.read(1)) bin_img = fh.read(C * H * W) - img = [[[ord(byte) / 255.0 for byte in bin_img[channel*H*W+row*W:channel*H*W+(row+1)*W]] + img = [[[ord(byte)/255.0 for byte in bin_img[channel*H*W+row*W:channel*H*W+(row+1)*W]] for row in range(H)] for channel in range(C)] fine_labels += [fine_label] coarse_labels += [coarse_label] @@ -80,7 +81,7 @@ def write_kaldi_matrix(file_handle, matrix, key): if num_cols != len(matrix[row_index]): raise Exception("All the rows of a matrix are expected to " "have the same length") - file_handle.write(" ".join(map(lambda x: str(x), matrix[row_index]))) + file_handle.write(" ".join([str(x) for x in matrix[row_index]])) if row_index != num_rows - 1: file_handle.write("\n") file_handle.write(" ]\n") diff --git a/egs/dihard_2018/v1/local/make_dihard_2018_dev.py b/egs/dihard_2018/v1/local/make_dihard_2018_dev.py index 71b2b1b0143..fa652da8b4c 100755 --- a/egs/dihard_2018/v1/local/make_dihard_2018_dev.py +++ b/egs/dihard_2018/v1/local/make_dihard_2018_dev.py @@ -35,7 +35,7 @@ def prepare_dihard_2018_dev(src_dir, data_dir): rttm_fi.write(rttm_str) with open("{}/data/rttm/{}.rttm".format(src_dir, utt), 'r') as fh: rttm_list = fh.readlines() - spk_list = map(lambda x: (x.split())[7], rttm_list) + spk_list = [(x.split())[7] for x in rttm_list] num_spk = len(set(spk_list)) reco2num_spk_fi.write("{} {}\n".format(utt, num_spk)) wavscp_fi.close() diff --git a/egs/dihard_2018/v1/local/make_dihard_2018_eval.py b/egs/dihard_2018/v1/local/make_dihard_2018_eval.py index f8bd434f51a..2a8acbee58d 100755 --- a/egs/dihard_2018/v1/local/make_dihard_2018_eval.py +++ b/egs/dihard_2018/v1/local/make_dihard_2018_eval.py @@ -35,7 +35,7 @@ def prepare_dihard_2018_eval(src_dir, data_dir): rttm_fi.write(rttm_str) with open("{}/data/rttm/{}.rttm".format(src_dir, utt), 'r') as fh: rttm_list = fh.readlines() - spk_list = map(lambda x: (x.split())[7], rttm_list) + spk_list = [(x.split())[7] for x in rttm_list] num_spk = 
len(set(spk_list)) reco2num_spk_fi.write("{} {}\n".format(utt, num_spk)) wavscp_fi.close() diff --git a/egs/dihard_2018/v2/local/make_musan.py b/egs/dihard_2018/v2/local/make_musan.py index 74c434990fb..c4b5c9359b4 100755 --- a/egs/dihard_2018/v2/local/make_musan.py +++ b/egs/dihard_2018/v2/local/make_musan.py @@ -47,9 +47,9 @@ def prepare_music(root_dir, use_vocals): utt2wav_str = utt2wav_str + utt + " " + utt2wav[utt] + "\n" num_good_files += 1 else: - print("Missing file", utt) + print("Missing file {}".format(utt)) num_bad_files += 1 - print("In music directory, processed", num_good_files, "files;", num_bad_files, "had missing wav data") + print("In music directory, processed {} files: {} had missing wav data".format(num_good_files, num_bad_files)) return utt2spk_str, utt2wav_str def prepare_speech(root_dir): @@ -73,9 +73,9 @@ utt2wav_str = utt2wav_str + utt + " " + utt2wav[utt] + "\n" num_good_files += 1 else: - print("Missing file", utt) + print("Missing file {}".format(utt)) num_bad_files += 1 - print("In speech directory, processed", num_good_files, "files;", num_bad_files, "had missing wav data") + print("In speech directory, processed {} files: {} had missing wav data".format(num_good_files, num_bad_files)) return utt2spk_str, utt2wav_str def prepare_noise(root_dir): @@ -99,9 +99,9 @@ utt2wav_str = utt2wav_str + utt + " " + utt2wav[utt] + "\n" num_good_files += 1 else: - print("Missing file", utt) + print("Missing file {}".format(utt)) num_bad_files += 1 - print("In noise directory, processed", num_good_files, "files;", num_bad_files, "had missing wav data") + print("In noise directory, processed {} files: {} had missing wav data".format(num_good_files, num_bad_files)) return utt2spk_str, utt2wav_str def main(): diff --git a/egs/fame/v1/local/prepare_for_eer.py b/egs/fame/v1/local/prepare_for_eer.py index 59d2985e7c2..f1dbcfa9ab6 100755 --- a/egs/fame/v1/local/prepare_for_eer.py +++ b/egs/fame/v1/local/prepare_for_eer.py @@ -1,3 +1,4 @@ +from __future__ import print_function # Copyright 2015 David Snyder # Apache 2.0. # @@ -12,4 +13,4 @@ spkrutt2target[spkr+utt]=target for line in scores: spkr, utt, score = line.strip().split() - print score, spkrutt2target[spkr+utt] + print(score, spkrutt2target[spkr+utt]) diff --git a/egs/fisher_callhome_spanish/s5/conf/mfcc_hires.conf b/egs/fisher_callhome_spanish/s5/conf/mfcc_hires.conf new file mode 100644 index 00000000000..d870ab04c38 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5/conf/mfcc_hires.conf @@ -0,0 +1,10 @@ +# config for high-resolution MFCC features, intended for neural network training. +# Note: we keep all cepstra, so it has the same info as filterbank features, +# but MFCC is more easily compressible (because less correlated) which is why +# we prefer this method. +--use-energy=false # use average of log energy, not energy. +--sample-frequency=8000 # Switchboard is sampled at 8kHz +--num-mel-bins=40 # similar to Google's setup. +--num-ceps=40 # there is no dimensionality reduction.
+--low-freq=40 # low cutoff frequency for mel bins +--high-freq=-200 # high cutoff frequently, relative to Nyquist of 4000 (=3800) diff --git a/egs/fisher_callhome_spanish/s5/conf/online_cmvn.conf b/egs/fisher_callhome_spanish/s5/conf/online_cmvn.conf new file mode 100644 index 00000000000..7748a4a4dd3 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5/conf/online_cmvn.conf @@ -0,0 +1 @@ +# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh diff --git a/egs/fisher_callhome_spanish/s5/local/callhome_get_lattices.py b/egs/fisher_callhome_spanish/s5/local/callhome_get_lattices.py index 9112d868c25..4c96e01ce7e 100755 --- a/egs/fisher_callhome_spanish/s5/local/callhome_get_lattices.py +++ b/egs/fisher_callhome_spanish/s5/local/callhome_get_lattices.py @@ -5,6 +5,7 @@ # The list of files in the conversations for which 1 best output has to be extracted # words.txt +from __future__ import print_function import os import sys import subprocess @@ -76,7 +77,7 @@ def findLattice(timeDetail): # Concatenate lattices mergedTranslation = latticeConcatenate(mergedTranslation, tmp) - print mergedTranslation + print(mergedTranslation) if mergedTranslation != "": # Sanjeev's Recipe : Remove epsilons and topo sort @@ -95,16 +96,16 @@ def findLattice(timeDetail): # file so it can be checked later proc = subprocess.Popen("/export/a04/gkumar/moses/mosesdecoder/checkplf < " + finalPLFFile + " 2>&1 | awk 'FNR == 2 {print}'", stdout=subprocess.PIPE, shell=True) line = proc.stdout.readline() - print line + " " + str(lineNo) + print("{} {}".format(line, lineNo)) if line.strip() != "PLF format appears to be correct.": os.system("cp " + finalFST + " " + invalidplfdir + "/" + timeInfo[0]) invalidPLF.write(invalidplfdir + "/" + timeInfo[0] + "\n") - rmLines.write(str(lineNo) + "\n") + rmLines.write("{}\n".format(lineNo)) else: provFile.write(PLFline) else: blankPLF.write(timeInfo[0] + "\n") - rmLines.write(str(lineNo) + "\n") + rmLines.write("{}\n".format(lineNo)) # Now convert to PLF lineNo += 1 diff --git a/egs/fisher_callhome_spanish/s5/local/chain/run_tdnn_1g.sh b/egs/fisher_callhome_spanish/s5/local/chain/run_tdnn_1g.sh new file mode 100755 index 00000000000..c487f1bd222 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5/local/chain/run_tdnn_1g.sh @@ -0,0 +1,288 @@ +#!/bin/bash + +# 1g is like 1f but upgrading to a "resnet-style TDNN-F model", i.e. +# with bypass resnet connections, and re-tuned. +# compute-wer --text --mode=present ark:exp/chain/multipsplice_tdnn/decode_fsp_train_test/scoring_kaldi/test_filt.txt ark,p:- +# %WER 22.21 [ 8847 / 39831, 1965 ins, 2127 del, 4755 sub ] +# %SER 56.98 [ 3577 / 6278 ] +# Scored 6278 sentences, 0 not present in hyp. + +# steps/info/chain_dir_info.pl exp/chain/multipsplice_tdnn +# exp/chain/multipsplice_tdnn: num-iters=296 nj=1..2 num-params=8.2M dim=40+100->2489 combine=-0.170->-0.165 (over 8) xent:train/valid[196,295,final]=(-2.30,-1.93,-1.83/-2.24,-1.96,-1.86) logprob:train/valid[196,295,final]=(-0.208,-0.169,-0.164/-0.189,-0.161,-0.158) + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +train_set=train +test_sets="test dev" +gmm=tri5a # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. +num_threads_ubm=32 +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. 
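An aside on the new conf/mfcc_hires.conf above: --high-freq is interpreted as an offset from the Nyquist frequency when it is not positive, which is what the "(=3800)" in its comment refers to. A minimal Python sketch of that arithmetic, purely illustrative and not part of the recipe; the variable names here are invented:

    # Effective mel-filterbank band implied by conf/mfcc_hires.conf above.
    sample_frequency = 8000.0              # --sample-frequency
    nyquist = sample_frequency / 2.0       # 4000 Hz
    low_freq = 40.0                        # --low-freq
    high_freq = -200.0                     # --high-freq; non-positive means "offset from Nyquist"
    effective_high = nyquist + high_freq if high_freq <= 0 else high_freq
    print("mel bins cover {:.0f} Hz to {:.0f} Hz".format(low_freq, effective_high))  # 40 Hz to 3800 Hz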
+ +# Options which are not passed through to run_ivector_common.sh +affix=1g #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +common_egs_dir= +reporting_email= + +# LSTM/chain options +train_stage=-10 +xent_regularize=0.1 +dropout_schedule='0,0@0.20,0.3@0.50,0' + +# training chunk-options +chunk_width=140,100,160 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 + +# training options +srand=0 +remove_egs=true + +#decode options +test_online_decoding=false # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 17 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 18 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 19 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + tdnn_opts="l2-regularize=0.01 dropout-proportion=0.0 dropout-per-dim-continuous=true" + tdnnf_opts="l2-regularize=0.01 dropout-proportion=0.0 bypass-scale=0.66" + linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0" + prefinal_opts="l2-regularize=0.01" + output_opts="l2-regularize=0.005" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $tdnn_opts dim=1024 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=0 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1024 
bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + linear-component name=prefinal-l dim=192 $linear_opts + + + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1024 small-dim=192 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=1024 small-dim=192 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 20 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/wsj-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.srand $srand \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.frames-per-iter 5000000 \ + --trainer.optimization.num-jobs-initial 1 \ + --trainer.optimization.num-jobs-final=2 \ + --trainer.optimization.initial-effective-lrate 0.0005 \ + --trainer.optimization.final-effective-lrate 0.00005 \ + --trainer.num-chunk-per-minibatch 128,64 \ + --trainer.optimization.momentum 0.0 \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context 0 \ + --egs.chunk-right-context 0 \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --cleanup.remove-egs $remove_egs \ + --use-gpu true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir exp/tri5a_lats_nodup_sp \ + --dir $dir || exit 1; +fi + +if [ $stage -le 21 ]; then + # The reason we are using data/lang_test here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + #LM was trained only on Fisher Spanish train subset. 
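As a brief aside before the mkgraph call below: the %WER line quoted in the header comments of this run_tdnn_1g.sh script follows the usual compute-wer convention, errors = insertions + deletions + substitutions over the number of reference words. A small hedged check of that arithmetic (the numbers are copied from the header above; nothing here is part of the recipe):

    # Sanity check of the %WER figure quoted in run_tdnn_1g.sh's header.
    ins, dels, subs = 1965, 2127, 4755     # from "%WER 22.21 [ 8847 / 39831, ... ]"
    ref_words = 39831
    errors = ins + dels + subs             # 8847
    print("WER = {:.2f}%".format(100.0 * errors / ref_words))  # prints "WER = 22.21%"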
+ + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test \ + $tree_dir $tree_dir/graph_fsp_train || exit 1; + +fi + +rnnlmdir=exp/rnnlm_lstm_tdnn_1b +if [ $stage -le 22 ]; then + local/rnnlm/train_rnnlm.sh --dir $rnnlmdir || exit 1; +fi + +if [ $stage -le 23 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l &1 | awk 'FNR == 2 {print}'", stdout=subprocess.PIPE, shell=True) line = proc.stdout.readline() - print line + " " + str(lineNo) + print("{} {}".format(line, lineNo)) if line.strip() != "PLF format appears to be correct.": os.system("cp " + finalFST + " " + invalidplfdir + "/" + timeInfo[0]) invalidPLF.write(invalidplfdir + "/" + timeInfo[0] + "\n") - rmLines.write(str(lineNo) + "\n") + rmLines.write("{}\n".format(lineNo)) else: provFile.write(PLFline) else: blankPLF.write(timeInfo[0] + "\n") - rmLines.write(str(lineNo) + "\n") + rmLines.write("{}\n".format(lineNo)) # Now convert to PLF lineNo += 1 diff --git a/egs/fisher_callhome_spanish/s5/local/merge_lexicons.py b/egs/fisher_callhome_spanish/s5/local/merge_lexicons.py index 5c09f09bc35..864b76b671b 100755 --- a/egs/fisher_callhome_spanish/s5/local/merge_lexicons.py +++ b/egs/fisher_callhome_spanish/s5/local/merge_lexicons.py @@ -4,6 +4,7 @@ # # Merges unique words from Spanish Fisher, Gigaword and the LDC spanish lexicon +from __future__ import print_function import sys import json import codecs @@ -24,8 +25,7 @@ merged_lexicon.append(line.strip()) fisher.close() -print "After adding the fisher data, the lexicon contains " \ - + str(len(merged_lexicon)) + " entries." +print("After adding the fisher data, the lexicon contains {} entries".format(len(merged_lexicon))) # Now add data from the LDC lexicon ldc = codecs.open(uw_LDC, encoding='iso-8859-1') @@ -34,12 +34,11 @@ if entries[0].lower() not in merged_lexicon: merged_lexicon.append(entries[0].lower()) -print "After adding the LDC data, the lexicon contains " \ - + str(len(merged_lexicon)) + " entries." +print("After adding the LDC data, the lexicon contains {} entries".format(len(merged_lexicon))) # Finally add the gigaword data gigaword = json.load(open(uw_gigaword)) -gigaword = reversed(sorted(gigaword.iteritems(), key=operator.itemgetter(1))) +gigaword = reversed(sorted(gigaword.items(), key=operator.itemgetter(1))) for item in gigaword: # We need a maximum of wordlimit words in the lexicon @@ -49,8 +48,7 @@ if item[0].lower() not in merged_lexicon: merged_lexicon.append(item[0].lower()) -print "After adding the Gigaword data, the lexicon contains " \ - + str(len(merged_lexicon)) + " entries." +print("After adding the Gigaword data, the lexicon contains {} entries".format(len(merged_lexicon))) # Now write the uniquewords to a file lf = codecs.open(tmpdir + '/uniquewords64k', encoding='utf-8', mode='w+') @@ -61,4 +59,4 @@ lf.close() -print "Finshed writing unique words" +print("Finshed writing unique words") diff --git a/egs/fisher_callhome_spanish/s5/local/nnet3/run_ivector_common.sh b/egs/fisher_callhome_spanish/s5/local/nnet3/run_ivector_common.sh new file mode 100755 index 00000000000..cc9de4d26c5 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5/local/nnet3/run_ivector_common.sh @@ -0,0 +1,187 @@ +#!/bin/bash + +set -e -o pipefail + +# This script is called from scripts like local/nnet3/run_tdnn.sh and +# local/chain/run_tdnn.sh (and may eventually be called by more scripts). It +# contains the common feature preparation and iVector-related parts of the +# script. 
See those scripts for examples of usage. + + +stage=7 +nj=30 +train_set=train # you might set this to e.g. train. +test_sets="test dev" +gmm=tri5a # This specifies a GMM-dir from the features of the type you're training the system on; + # it should contain alignments for 'train_set'. + +num_threads_ubm=32 +nnet3_affix= # affix for exp/nnet3 directory to put iVector stuff in (e.g. + # in the tedlium recip it's _cleaned). + +. ./cmd.sh +. ./path.sh +. utils/parse_options.sh + + +gmm_dir=exp/${gmm} +ali_dir=exp/${gmm}_ali_${train_set}_sp + +for f in data/${train_set}/feats.scp ${gmm_dir}/final.mdl; do + if [ ! -f $f ]; then + echo "$0: expected file $f to exist" + exit 1 + fi +done + + + +if [ $stage -le 7 ] && [ -f data/${train_set}_sp_hires/feats.scp ]; then + echo "$0: data/${train_set}_sp_hires/feats.scp already exists." + echo " ... Please either remove it, or rerun this script with stage > 7." + exit 1 +fi + + +if [ $stage -le 8 ]; then + echo "$0: preparing directory for speed-perturbed data" + utils/data/perturb_data_dir_speed_3way.sh data/${train_set} data/${train_set}_sp +fi + +if [ $stage -le 9 ]; then + echo "$0: creating high-resolution MFCC features" + + # this shows how you can split across multiple file-systems. we'll split the + # MFCC dir across multiple locations. You might want to be careful here, if you + # have multiple copies of Kaldi checked out and run the same recipe, not to let + # them overwrite each other. + mfccdir=data/${train_set}_sp_hires/data + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then + utils/create_split_dir.pl /export/b0{5,6,7,8}/$USER/kaldi-data/mfcc/wsj-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage + fi + + for datadir in ${train_set}_sp ${test_sets}; do + utils/copy_data_dir.sh data/$datadir data/${datadir}_hires + done + + # do volume-perturbation on the training data prior to extracting hires + # features; this helps make trained nnets more invariant to test data volume. + utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires + + for datadir in ${train_set}_sp ${test_sets}; do + steps/make_mfcc.sh --nj $nj --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/${datadir}_hires + steps/compute_cmvn_stats.sh data/${datadir}_hires + utils/fix_data_dir.sh data/${datadir}_hires + done +fi + +if [ $stage -le 10 ]; then + echo "$0: computing a subset of data to train the diagonal UBM." + + mkdir -p exp/nnet3${nnet3_affix}/diag_ubm + temp_data_root=exp/nnet3${nnet3_affix}/diag_ubm + + # train a diagonal UBM using a subset of about a quarter of the data + num_utts_total=$(wc -l $text_dir/ami.txt + cat $dev | cut -d ' ' -f2- > $text_dir/dev.txt +fi + +if [ $stage -le 1 ]; then + cp $wordlist $dir/config/ + n=`cat $dir/config/words.txt | wc -l` + echo " $n" >> $dir/config/words.txt + + # words that are not present in words.txt but are in the training or dev data, will be + # mapped to during training. 
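A hedged illustration of the out-of-vocabulary handling described in the comment above: any training or dev word that is missing from words.txt gets replaced by the symbol written to $dir/config/oov.txt on the next line. The actual angle-bracketed symbol is not visible in this rendering, so the sketch below uses a placeholder; none of these names come from the recipe itself.

    # Placeholder OOV symbol; in the recipe the real symbol is whatever
    # gets written into $dir/config/oov.txt.
    OOV_SYMBOL = "<unk>"  # assumption, not taken from this patch

    def load_vocab(words_txt):
        # words.txt lines look like "word integer-id"
        with open(words_txt) as f:
            return {line.split()[0] for line in f if line.strip()}

    def map_oov(line, vocab):
        # replace unknown words so the language model only sees in-vocabulary tokens
        return " ".join(w if w in vocab else OOV_SYMBOL for w in line.split())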
+ echo "" >$dir/config/oov.txt + + cat > $dir/config/data_weights.txt <$dir/config/unigram_probs.txt + + # choose features + rnnlm/choose_features.py --unigram-probs=$dir/config/unigram_probs.txt \ + --use-constant-feature=true \ + --top-word-features 10000 \ + --min-frequency 1.0e-03 \ + --special-words=',,,,[noise],[laughter]' \ + $dir/config/words.txt > $dir/config/features.txt + +lstm_opts="l2-regularize=$comp_l2" +tdnn_opts="l2-regularize=$comp_l2" +output_opts="l2-regularize=$output_l2" + + cat >$dir/config/xconfig <&1 | awk 'FNR == 2 {print}'", stdout=subprocess.PIPE, shell=True) line = proc.stdout.readline() - print line + " " + str(lineNo) + print("{} {}".format(line, lineNo)) if line.strip() != "PLF format appears to be correct.": os.system("cp " + finalFST + " " + invalidplfdir + "/" + timeInfo[0]) invalidPLF.write(invalidplfdir + "/" + timeInfo[0] + "\n") - rmLines.write(str(lineNo) + "\n") + rmLines.write("{}\n".format(lineNo)) else: provFile.write(PLFline) else: blankPLF.write(timeInfo[0] + "\n") - rmLines.write(str(lineNo) + "\n") + rmLines.write("{}\n".format(lineNo)) # Now convert to PLF lineNo += 1 diff --git a/egs/fisher_callhome_spanish/s5/path.sh b/egs/fisher_callhome_spanish/s5/path.sh index 1a6fb5f891b..17ffb0369f8 100755 --- a/egs/fisher_callhome_spanish/s5/path.sh +++ b/egs/fisher_callhome_spanish/s5/path.sh @@ -3,3 +3,4 @@ export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 . $KALDI_ROOT/tools/config/common_path.sh export LC_ALL=C +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/dpovey/libs diff --git a/egs/fisher_callhome_spanish/s5/rnnlm b/egs/fisher_callhome_spanish/s5/rnnlm new file mode 120000 index 00000000000..fb754622d5e --- /dev/null +++ b/egs/fisher_callhome_spanish/s5/rnnlm @@ -0,0 +1 @@ +../../wsj/s5/rnnlm \ No newline at end of file diff --git a/egs/fisher_callhome_spanish/s5/run.sh b/egs/fisher_callhome_spanish/s5/run.sh index 57902a98fed..6e2752a7b68 100755 --- a/egs/fisher_callhome_spanish/s5/run.sh +++ b/egs/fisher_callhome_spanish/s5/run.sh @@ -1,20 +1,22 @@ #!/bin/bash # +# Copyright 2018 Nagendra Goel, Saikiran Valluri Apache 2.0 # Copyright 2014 Gaurav Kumar. Apache 2.0 # Recipe for Fisher/Callhome-Spanish -# Made to integrate KALDI with JOSHUA for end-to-end ASR and SMT stage=0 +train_stage=-20 +train_sgmm2=false # call the next line with the directory where the Spanish Fisher data is # (the values below are just an example). -sfisher_speech=/veu4/jadrian/data/LDC/LDC2010S01 -sfisher_transcripts=/veu4/jadrian/data/LDC/LDC2010T04 -spanish_lexicon=/veu4/jadrian/data/LDC/LDC96L16 +sfisher_speech=/export/corpora/LDC/LDC2010S01 +sfisher_transcripts=/export/corpora/LDC/LDC2010T04 +spanish_lexicon=/export/corpora/LDC/LDC96L16 split=local/splits/split_fisher -callhome_speech=/veu4/jadrian/data/LDC/LDC96S35 -callhome_transcripts=/veu4/jadrian/data/LDC/LDC96T17 +callhome_speech=/export/corpora/LDC/LDC96S35 +callhome_transcripts=/export/corpora/LDC/LDC96T17 split_callhome=local/splits/split_callhome mfccdir=`pwd`/mfcc @@ -25,7 +27,7 @@ if [ -f path.sh ]; then . 
./path.sh; fi set -e -if [ $stage -lt 1 ]; then +if [ $stage -le 1 ]; then local/fsp_data_prep.sh $sfisher_speech $sfisher_transcripts local/callhome_data_prep.sh $callhome_speech $callhome_transcripts @@ -95,7 +97,7 @@ if [ $stage -lt 1 ]; then local/callhome_create_splits.sh $split_callhome fi -if [ $stage -lt 2 ]; then +if [ $stage -le 2 ]; then # Now compute CMVN stats for the train, dev and test subsets steps/compute_cmvn_stats.sh data/dev exp/make_mfcc/dev $mfccdir steps/compute_cmvn_stats.sh data/test exp/make_mfcc/test $mfccdir @@ -124,90 +126,95 @@ if [ $stage -lt 2 ]; then utils/subset_data_dir.sh --speakers data/train 90000 data/train_100k fi +if [ $stage -le 3 ]; then + steps/train_mono.sh --nj 10 --cmd "$train_cmd" \ + data/train_10k_nodup data/lang exp/mono0a -steps/train_mono.sh --nj 10 --cmd "$train_cmd" \ - data/train_10k_nodup data/lang exp/mono0a + steps/align_si.sh --nj 30 --cmd "$train_cmd" \ + data/train_30k data/lang exp/mono0a exp/mono0a_ali || exit 1; -steps/align_si.sh --nj 30 --cmd "$train_cmd" \ - data/train_30k data/lang exp/mono0a exp/mono0a_ali || exit 1; - -steps/train_deltas.sh --cmd "$train_cmd" \ + steps/train_deltas.sh --cmd "$train_cmd" \ 2500 20000 data/train_30k data/lang exp/mono0a_ali exp/tri1 || exit 1; -(utils/mkgraph.sh data/lang_test exp/tri1 exp/tri1/graph - steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ - exp/tri1/graph data/dev exp/tri1/decode_dev)& + (utils/mkgraph.sh data/lang_test exp/tri1 exp/tri1/graph + steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + exp/tri1/graph data/dev exp/tri1/decode_dev)& -steps/align_si.sh --nj 30 --cmd "$train_cmd" \ - data/train_30k data/lang exp/tri1 exp/tri1_ali || exit 1; + steps/align_si.sh --nj 30 --cmd "$train_cmd" \ + data/train_30k data/lang exp/tri1 exp/tri1_ali || exit 1; -steps/train_deltas.sh --cmd "$train_cmd" \ + steps/train_deltas.sh --cmd "$train_cmd" \ 2500 20000 data/train_30k data/lang exp/tri1_ali exp/tri2 || exit 1; -( - utils/mkgraph.sh data/lang_test exp/tri2 exp/tri2/graph || exit 1; - steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ - exp/tri2/graph data/dev exp/tri2/decode_dev || exit 1; -)& - + ( + utils/mkgraph.sh data/lang_test exp/tri2 exp/tri2/graph || exit 1; + steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + exp/tri2/graph data/dev exp/tri2/decode_dev || exit 1; + )& +fi -steps/align_si.sh --nj 30 --cmd "$train_cmd" \ - data/train_100k data/lang exp/tri2 exp/tri2_ali || exit 1; +if [ $stage -le 4 ]; then + steps/align_si.sh --nj 30 --cmd "$train_cmd" \ + data/train_100k data/lang exp/tri2 exp/tri2_ali || exit 1; # Train tri3a, which is LDA+MLLT, on 100k data. -steps/train_lda_mllt.sh --cmd "$train_cmd" \ + steps/train_lda_mllt.sh --cmd "$train_cmd" \ --splice-opts "--left-context=3 --right-context=3" \ 3000 40000 data/train_100k data/lang exp/tri2_ali exp/tri3a || exit 1; -( - utils/mkgraph.sh data/lang_test exp/tri3a exp/tri3a/graph || exit 1; - steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ - exp/tri3a/graph data/dev exp/tri3a/decode_dev || exit 1; -)& - + ( + utils/mkgraph.sh data/lang_test exp/tri3a exp/tri3a/graph || exit 1; + steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + exp/tri3a/graph data/dev exp/tri3a/decode_dev || exit 1; + )& +fi +if [ $stage -le 5 ]; then # Next we'll use fMLLR and train with SAT (i.e. 
on # fMLLR features) -steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \ - data/train_100k data/lang exp/tri3a exp/tri3a_ali || exit 1; + steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \ + data/train_100k data/lang exp/tri3a exp/tri3a_ali || exit 1; -steps/train_sat.sh --cmd "$train_cmd" \ - 4000 60000 data/train_100k data/lang exp/tri3a_ali exp/tri4a || exit 1; + steps/train_sat.sh --cmd "$train_cmd" \ + 4000 60000 data/train_100k data/lang exp/tri3a_ali exp/tri4a || exit 1; -( - utils/mkgraph.sh data/lang_test exp/tri4a exp/tri4a/graph - steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ - exp/tri4a/graph data/dev exp/tri4a/decode_dev + ( + utils/mkgraph.sh data/lang_test exp/tri4a exp/tri4a/graph + steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + exp/tri4a/graph data/dev exp/tri4a/decode_dev )& -steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \ - data/train data/lang exp/tri4a exp/tri4a_ali || exit 1; + steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \ + data/train data/lang exp/tri4a exp/tri4a_ali || exit 1; # Reduce the number of gaussians -steps/train_sat.sh --cmd "$train_cmd" \ - 5000 120000 data/train data/lang exp/tri4a_ali exp/tri5a || exit 1; + steps/train_sat.sh --cmd "$train_cmd" \ + 5000 120000 data/train data/lang exp/tri4a_ali exp/tri5a || exit 1; -( - utils/mkgraph.sh data/lang_test exp/tri5a exp/tri5a/graph - steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ - exp/tri5a/graph data/dev exp/tri5a/decode_dev - steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ - exp/tri5a/graph data/test exp/tri5a/decode_test + ( + utils/mkgraph.sh data/lang_test exp/tri5a exp/tri5a/graph + steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + exp/tri5a/graph data/dev exp/tri5a/decode_dev + steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + exp/tri5a/graph data/test exp/tri5a/decode_test # Decode CALLHOME - steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ - exp/tri5a/graph data/callhome_test exp/tri5a/decode_callhome_test - steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ - exp/tri5a/graph data/callhome_dev exp/tri5a/decode_callhome_dev - steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ - exp/tri5a/graph data/callhome_train exp/tri5a/decode_callhome_train -) & - + steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + exp/tri5a/graph data/callhome_test exp/tri5a/decode_callhome_test + steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + exp/tri5a/graph data/callhome_dev exp/tri5a/decode_callhome_dev + steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + exp/tri5a/graph data/callhome_train exp/tri5a/decode_callhome_train + ) & + + + steps/align_fmllr.sh \ + --boost-silence 0.5 --nj 32 --cmd "$train_cmd" \ + data/train data/lang exp/tri5a exp/tri5a_ali +fi -steps/align_fmllr.sh \ - --boost-silence 0.5 --nj 32 --cmd "$train_cmd" \ - data/train data/lang exp/tri5a exp/tri5a_ali +if $train_sgmm2; then steps/train_ubm.sh \ --cmd "$train_cmd" 750 \ @@ -258,22 +265,7 @@ for iter in 1 2 3 4; do done ) & -dnn_cpu_parallel_opts=(--minibatch-size 128 --max-change 10 --num-jobs-nnet 8 --num-threads 16 \ - --parallel-opts "--num-threads 16") -dnn_gpu_parallel_opts=(--minibatch-size 512 --max-change 40 --num-jobs-nnet 4 --num-threads 1 \ - --parallel-opts 
"--gpu 1") - -steps/nnet2/train_pnorm_ensemble.sh \ - --mix-up 5000 --initial-learning-rate 0.008 --final-learning-rate 0.0008\ - --num-hidden-layers 4 --pnorm-input-dim 2000 --pnorm-output-dim 200\ - --cmd "$train_cmd" \ - "${dnn_gpu_parallel_opts[@]}" \ - --ensemble-size 4 --initial-beta 0.1 --final-beta 5 \ - data/train data/lang exp/tri5a_ali exp/tri6a_dnn +fi -( - steps/nnet2/decode.sh --nj 13 --cmd "$decode_cmd" --num-threads 4 \ - --scoring-opts "--min-lmwt 8 --max-lmwt 16" --transform-dir exp/tri5a/decode_dev exp/tri5a/graph data/dev exp/tri6a_dnn/decode_dev -) & -wait +local/chain/run_tdnn_1g.sh --stage $stage --train-stage $train_stage || exit 1; exit 0; diff --git a/egs/fisher_swbd/s5/local/format_acronyms_ctm_eval2000.py b/egs/fisher_swbd/s5/local/format_acronyms_ctm_eval2000.py index 3c447c5976a..75cc4458d85 100755 --- a/egs/fisher_swbd/s5/local/format_acronyms_ctm_eval2000.py +++ b/egs/fisher_swbd/s5/local/format_acronyms_ctm_eval2000.py @@ -10,6 +10,7 @@ # en_4156 B 414.58 0.16 l # en_4156 B 414.74 0.17 a +from __future__ import division import argparse,re __author__ = 'Minhua Wu' @@ -27,7 +28,7 @@ if items[4].find(".") != -1: letters = items[4].split("._") acronym_period = round(float(items[3]), 2) - letter_slot = round(acronym_period / len(letters), 2) + letter_slot = round(acronym_period/len(letters), 2) time_start = round(float(items[2]), 2) for l in letters[:-1]: time = " %.2f %.2f " % (time_start, letter_slot) diff --git a/egs/fisher_swbd/s5/local/format_acronyms_ctm_rt03.py b/egs/fisher_swbd/s5/local/format_acronyms_ctm_rt03.py index 59814beb4ea..c3f9af09c99 100755 --- a/egs/fisher_swbd/s5/local/format_acronyms_ctm_rt03.py +++ b/egs/fisher_swbd/s5/local/format_acronyms_ctm_rt03.py @@ -10,6 +10,7 @@ # en_4156 B 414.58 0.16 l # en_4156 B 414.74 0.17 a +from __future__ import division import argparse,re __author__ = 'Minhua Wu' @@ -27,7 +28,7 @@ if items[4].find(".") != -1: letters = items[4].split("._") acronym_period = round(float(items[3]), 2) - letter_slot = round(acronym_period / len(letters), 2) + letter_slot = round(acronym_period/ len(letters), 2) time_start = round(float(items[2]), 2) for l in letters[:-1]: time = " %.2f %.2f " % (time_start, letter_slot) diff --git a/egs/gale_mandarin/s5/local/gale_segment.py b/egs/gale_mandarin/s5/local/gale_segment.py index 975ddb9c143..d652eb837f3 100755 --- a/egs/gale_mandarin/s5/local/gale_segment.py +++ b/egs/gale_mandarin/s5/local/gale_segment.py @@ -1,6 +1,7 @@ #!/usr/bin/env python #coding:utf-8 #!/usr/bin/env python +from __future__ import print_function import sys from mmseg import seg_txt for line in sys.stdin: @@ -12,4 +13,4 @@ continue for j in seg_txt(blks[i]): out_line += " " + j - print out_line + print(out_line) diff --git a/egs/hub4_english/s5/local/data_prep/process_1995_bn_annotation.py b/egs/hub4_english/s5/local/data_prep/process_1995_bn_annotation.py index be0c7ad8e0d..5675dc3fbd9 100755 --- a/egs/hub4_english/s5/local/data_prep/process_1995_bn_annotation.py +++ b/egs/hub4_english/s5/local/data_prep/process_1995_bn_annotation.py @@ -31,9 +31,9 @@ def get_args(): parser = argparse.ArgumentParser("Process 1995 CSR-IV HUB4 transcripts") - parser.add_argument("--noise-word", type=str, default="", + parser.add_argument("--noise-word", default="", help="Word to add in-place of noise words") - parser.add_argument("--spoken-noise-word", type=str, + parser.add_argument("--spoken-noise-word", default="", help="Word to add in-place of speaker noise words") parser.add_argument("in_file", type=argparse.FileType('r'), 
@@ -230,7 +230,7 @@ def run(args): start_time = story_end_time segments = process_story_content( args, reco_id, - ' '.join([unicode(x) for x in s.children]), + ' '.join([str(x) for x in s.children]), start_time=story_begin_time, end_time=story_end_time) write_segments(segments, args) elif (s.name is not None and s.name != "language" @@ -240,9 +240,9 @@ def run(args): "or or ; got {0}".format(s)) elif s.name == "language" or s.name == "sung": non_story_contents.append( - ' '.join([unicode(x) for x in s.children])) + ' '.join([str(x) for x in s.children])) else: - non_story_contents.append(unicode(s)) + non_story_contents.append(str(s)) except RuntimeError: raise except Exception: diff --git a/egs/hub4_english/s5/local/data_prep/process_1996_csr_hub4_lm_filelist.py b/egs/hub4_english/s5/local/data_prep/process_1996_csr_hub4_lm_filelist.py index 95aa7ddb831..fb5ba7a64ee 100755 --- a/egs/hub4_english/s5/local/data_prep/process_1996_csr_hub4_lm_filelist.py +++ b/egs/hub4_english/s5/local/data_prep/process_1996_csr_hub4_lm_filelist.py @@ -36,9 +36,9 @@ def get_args(): corpus (LDC98T31).""") parser.add_argument("--verbose", choices=[0,1,2,3], type=int, default=0, help="Set higher for more verbose logging.") - parser.add_argument("file_list", type=str, + parser.add_argument("file_list", help="""List of compressed source files""") - parser.add_argument("dir", type=str, + parser.add_argument("dir", help="Output directory to dump processed files to") args = parser.parse_args() @@ -83,7 +83,7 @@ def process_file_lines(lines, out_file_handle): for x in para.contents: try: if x.name is None: - normalized_text = normalize_text(unicode(x)) + normalized_text = normalize_text(str(x)) if len(normalized_text) == 0: continue out_file_handle.write("{0}\n".format( diff --git a/egs/hub4_english/s5/local/data_prep/process_na_news_text.py b/egs/hub4_english/s5/local/data_prep/process_na_news_text.py index 94b02a766a9..08203f7ada1 100755 --- a/egs/hub4_english/s5/local/data_prep/process_na_news_text.py +++ b/egs/hub4_english/s5/local/data_prep/process_na_news_text.py @@ -38,10 +38,10 @@ def get_args(): parser = argparse.ArgumentParser("Prepare NA News Text corpus (LDC95T21).") parser.add_argument("--verbose", type=int, choices=[0, 1, 2, 3], default=0, help="Use larger verbosity for more verbose logging.") - parser.add_argument("file_list", type=str, + parser.add_argument("file_list", help="List of compressed source files for NA News Text. " "e.g: /export/corpora/LDC/LDC95T21/na_news_1/latwp/1994") - parser.add_argument("out_file", type=str, + parser.add_argument("out_file", help="Output file to write to.") args = parser.parse_args() @@ -85,7 +85,7 @@ def process_file_lines(lines, out_file_handle): continue for para in art.find_all('p'): assert para.name == 'p' - text = ' '.join([unicode(x).strip() for x in para.contents]) + text = ' '.join([str(x).strip() for x in para.contents]) normalized_text = normalize_text(text) out_file_handle.write("{0}\n".format( normalized_text.encode('ascii'))) diff --git a/egs/hub4_english/s5/local/lm/merge_word_counts.py b/egs/hub4_english/s5/local/lm/merge_word_counts.py index 6338cbbf875..85e15d8dc07 100755 --- a/egs/hub4_english/s5/local/lm/merge_word_counts.py +++ b/egs/hub4_english/s5/local/lm/merge_word_counts.py @@ -7,6 +7,7 @@ A min-count argument is required to only write counts that are above the specified minimum count. 
""" +from __future__ import print_function import sys @@ -21,7 +22,7 @@ def main(): parts = line.strip().split() words[parts[1]] = words.get(parts[1], 0) + int(parts[0]) - for word, count in words.iteritems(): + for word, count in words.items(): if count >= int(sys.argv[1]): print ("{0} {1}".format(count, word)) diff --git a/egs/hub4_spanish/s5/local/lexicon/make_unicode_lexicon.py b/egs/hub4_spanish/s5/local/lexicon/make_unicode_lexicon.py index 25f26f38a4f..69b4e374b6e 100755 --- a/egs/hub4_spanish/s5/local/lexicon/make_unicode_lexicon.py +++ b/egs/hub4_spanish/s5/local/lexicon/make_unicode_lexicon.py @@ -106,6 +106,7 @@ # Import Statements from __future__ import print_function +from __future__ import division import codecs import argparse import unicodedata @@ -338,8 +339,8 @@ def encode(unicode_transcription, tag_percentage, log=False): graph2int = {v: k for k, v in enumerate(set(graph_list))} int2graph = {v: k for k, v in graph2int.items()} graph_list_int = [graph2int[g] for g in graph_list] - bin_edges = range(0, len(int2graph.keys()) + 1) - graph_counts = np.histogram(graph_list_int, bins=bin_edges)[0] / float(len(graph_list_int)) + bin_edges = list(range(0, len(int2graph.keys()) + 1)) + graph_counts = np.histogram(graph_list_int, bins=bin_edges)[0]/ float(len(graph_list_int)) # Set count threshold to frequency that tags the bottom 10% of graphemes bottom_idx = int(np.floor(tag_percentage * len(graph_counts))) count_thresh = sorted(graph_counts)[bottom_idx] @@ -464,7 +465,7 @@ def encode(unicode_transcription, tag_percentage, log=False): for g_dict in table: g_map = "" map_number = 0 - for g_field, g_val in sorted(g_dict.iteritems()): + for g_field, g_val in sorted(g_dict.items()): if(g_field == ("MAP" + str(map_number))): g_map = g_map + g_val + " " map_number = map_number + 1 @@ -594,7 +595,7 @@ def write_map(grapheme_map, mapfile): ''' with codecs.open(mapfile, 'w', encoding='utf-8') as f: - for g, g_map in grapheme_map.iteritems(): + for g, g_map in grapheme_map.items(): print(g, g_map, file=f) @@ -612,14 +613,14 @@ def write_lexicon(baseforms, encoded_transcription, outfile, sil_lex=None, with codecs.open(outfile, "w", "utf-8") as f: # First write the non-speech words try: - for w in sil_lex.iterkeys(): + for w in sil_lex.keys(): f.write("%s\t%s\n" % (w, sil_lex[w])) except AttributeError: pass # Then write extra-speech words try: - for w in extra_lex.iterkeys(): + for w in extra_lex.keys(): f.write("%s\t%s\n" % (w, extra_lex[w])) except AttributeError: pass @@ -628,9 +629,9 @@ def write_lexicon(baseforms, encoded_transcription, outfile, sil_lex=None, for idx, w in enumerate(baseforms): # This is really just for BABEL in case is written as a word if(w[0].lower() == ""): - f.write("%s\t\n" % (unicode(w[0]))) + f.write("%s\t\n" % (str(w[0]))) else: - f.write("%s\t%s\n" % (unicode(w[0]), + f.write("%s\t%s\n" % (str(w[0]), encoded_transcription[idx])) if __name__ == "__main__": diff --git a/egs/hub4_spanish/s5/local/prepare_unicode_dict.py b/egs/hub4_spanish/s5/local/prepare_unicode_dict.py index 86fa4d60ba1..3b9dc1abd86 100755 --- a/egs/hub4_spanish/s5/local/prepare_unicode_dict.py +++ b/egs/hub4_spanish/s5/local/prepare_unicode_dict.py @@ -89,7 +89,7 @@ def extract_phonemes(lexicon): # Read all baseform units into dictionary with {a: [a, a_1, a_2], # b: [b_1, b_3], ...} phonemes_dict = {} - for word, pron in lexicon.iteritems(): + for word, pron in lexicon.items(): for p in pron.split(): try: base = p.split("_",1)[0] @@ -98,11 +98,11 @@ def extract_phonemes(lexicon): 
phonemes_dict[base] = [p] # Makes sure there are no repeats in the list - phonemes_dict = {k: set(v) for k, v in phonemes_dict.iteritems()} + phonemes_dict = {k: set(v) for k, v in phonemes_dict.items()} # Get all unique phonemes phonemes = [] - for v in phonemes_dict.itervalues(): + for v in phonemes_dict.values(): for p in v: phonemes.append(p) @@ -137,11 +137,11 @@ def write_extra_questions(nonsil_phonemes, nonsil_phonemes_dict, # Write all possible phone_tag combinations that occur in the lexicon for tag in tags: - for p in nonsil_phonemes_dict.iterkeys(): + for p in nonsil_phonemes_dict.keys(): tagged_phoneme = "_".join([p, tag]) if(tagged_phoneme in nonsil_phonemes_dict[p]): fp.write("%s " % tagged_phoneme) - for p in sil_phonemes_dict.iterkeys(): + for p in sil_phonemes_dict.keys(): tagged_phoneme = "_".join([p, tag]) if(tagged_phoneme in sil_phonemes_dict[p]): fp.write("%s " % tagged_phoneme) diff --git a/egs/iam/v2/local/gen_topo.py b/egs/iam/v2/local/gen_topo.py index 540bfbcf270..8ffc59c5788 100755 --- a/egs/iam/v2/local/gen_topo.py +++ b/egs/iam/v2/local/gen_topo.py @@ -9,6 +9,7 @@ # the number of states for other characters. from __future__ import print_function +from __future__ import division import argparse import string @@ -19,11 +20,11 @@ parser.add_argument("num_nonsil_states", type=int, help="number of states for nonsilence phones"); parser.add_argument("num_sil_states", type=int, help="number of states for silence phones"); parser.add_argument("num_punctuation_states", type=int, help="number of states for punctuation"); -parser.add_argument("nonsilence_phones", type=str, +parser.add_argument("nonsilence_phones", help="List of non-silence phones as integers, separated by colons, e.g. 4:5:6:7:8:9"); -parser.add_argument("silence_phones", type=str, +parser.add_argument("silence_phones", help="List of silence phones as integers, separated by colons, e.g. 
1:2:3"); -parser.add_argument("phone_list", type=str, help="file containing all phones and their corresponding number."); +parser.add_argument("phone_list", help="file containing all phones and their corresponding number."); args = parser.parse_args() @@ -47,8 +48,8 @@ print("") for x in range(0, args.num_nonsil_states): xp1 = x + 1 - print(" " + str(x) + " " + str(x) + " " + str(x) + " 0.75 " + str(xp1) + " 0.25 ") -print(" " + str(args.num_nonsil_states) + " ") + print(" {0} {0} {0} 0.75 {1} 0.25 ".format(x, xp1)) +print(" {} ".format(args.num_nonsil_states)) print("") # For nonsilence phones that ar punctuations @@ -58,8 +59,8 @@ print("") for x in range(0, args.num_punctuation_states): xp1 = x + 1 - print(" " + str(x) + " " + str(x) + " " + str(x) + " 0.75 " + str(xp1) + " 0.25 ") -print(" " + str(args.num_punctuation_states) + " ") + print(" {0} {0} {0} 0.75 {1} 0.25 ".format(x, xp1)) +print(" {} ".format(args.num_punctuation_states)) print("") # For silence phones @@ -72,21 +73,21 @@ state_str = " 0 0 " for x in range(0, (args.num_sil_states - 1)): - state_str = state_str + " " + str(x) + " " + str(transp) + " " + state_str = "{} {} {} ".format(state_str, x, transp)) state_str = state_str + "" print(state_str) for x in range(1, (args.num_sil_states - 1)): - state_str = " " + str(x) + " " + str(x) + " " + state_str = " {0} " + str(y) + " " + str(transp) + " " + state_str = "{} {} {} ".format(state_str, y, transp)) state_str = state_str + "" print(state_str) second_last = args.num_sil_states - 1 - print(" " + str(second_last) + " " + str(second_last) + " " + str(second_last) + " 0.75 " + str(args.num_sil_states) + " 0.25 ") - print(" " + str(args.num_sil_states) + " ") + print(" {0} {0} {0} 0.75 {1} 0.25 ".format(second_last, args.num_sil_states)) + print(" {} ".format(args.num_sil_states)) else: print(" 0 0 0 0.75 1 0.25 ") - print(" " + str(args.num_sil_states) + " ") + print(" {} ".format(args.num_sil_states)) print("") print("") diff --git a/egs/ifnenit/v1/README.txt b/egs/ifnenit/README.txt similarity index 100% rename from egs/ifnenit/v1/README.txt rename to egs/ifnenit/README.txt diff --git a/egs/ifnenit/v1/local/make_features.py b/egs/ifnenit/v1/local/make_features.py index 3a485e32eb1..87afa37c00a 100755 --- a/egs/ifnenit/v1/local/make_features.py +++ b/egs/ifnenit/v1/local/make_features.py @@ -10,7 +10,7 @@ eg. 
local/make_features.py data/train --feat-dim 40 """ - +from __future__ import division import argparse import os @@ -24,8 +24,8 @@ signal(SIGPIPE,SIG_DFL) parser = argparse.ArgumentParser(description="""Generates and saves the feature vectors""") -parser.add_argument('dir', type=str, help='directory of images.scp and is also output directory') -parser.add_argument('--out-ark', type=str, default='-', help='where to write the output feature file') +parser.add_argument('dir', help='directory of images.scp and is also output directory') +parser.add_argument('--out-ark', default='-', help='where to write the output feature file') parser.add_argument('--feat-dim', type=int, default=40, help='size to scale the height of all images') parser.add_argument('--padding', type=int, default=5, help='size to scale the height of all images') args = parser.parse_args() @@ -42,7 +42,7 @@ def write_kaldi_matrix(file_handle, matrix, key): if num_cols != len(matrix[row_index]): raise Exception("All the rows of a matrix are expected to " "have the same length") - file_handle.write(" ".join(map(lambda x: str(x), matrix[row_index]))) + file_handle.write(" ".join([str(x) for x in matrix[row_index]])) if row_index != num_rows - 1: file_handle.write("\n") file_handle.write(" ]\n") @@ -51,7 +51,7 @@ def get_scaled_image(im): scale_size = args.feat_dim sx = im.shape[1] sy = im.shape[0] - scale = (1.0 * scale_size) / sy + scale = (1.0 * scale_size)/ sy nx = int(scale_size) ny = int(scale * sx) im = misc.imresize(im, (nx, ny)) diff --git a/egs/librispeech/s5/local/chain/run_tdnn_lstm.sh b/egs/librispeech/s5/local/chain/run_tdnn_lstm.sh new file mode 120000 index 00000000000..a4fa11e0908 --- /dev/null +++ b/egs/librispeech/s5/local/chain/run_tdnn_lstm.sh @@ -0,0 +1 @@ +tuning/run_tdnn_lstm_1b.sh \ No newline at end of file diff --git a/egs/librispeech/s5/local/chain/tuning/run_tdnn_lstm_1a.sh b/egs/librispeech/s5/local/chain/tuning/run_tdnn_lstm_1a.sh new file mode 100755 index 00000000000..812bf5e7fc5 --- /dev/null +++ b/egs/librispeech/s5/local/chain/tuning/run_tdnn_lstm_1a.sh @@ -0,0 +1,222 @@ +#!/bin/bash +# this is the tdnn-lstmp based on the run_tdnn_lstm_1n.sh under Switchboard. 
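Stepping back to the get_scaled_image change in egs/ifnenit/v1/local/make_features.py above: it rescales every image to a fixed height of --feat-dim pixels while keeping the aspect ratio, so the width is scaled by the same factor as the height. A small hedged numerical example (the input size is invented for illustration; only the formula comes from the script):

    # Resizing arithmetic from get_scaled_image, with an invented input.
    feat_dim = 40                               # --feat-dim
    height, width = 120, 300                    # hypothetical input image (rows, cols)
    scale = (1.0 * feat_dim) / height           # 1/3
    new_shape = (feat_dim, int(scale * width))  # (40, 100): height fixed, width scaled
    print(new_shape)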
+ +# training acoustic model and decoding: +# local/chain/tuning/run_tdnn_lstm_1a.sh +# System tdnn_lstm1a_sp +# WER on dev(fglarge) 3.44 +# WER on dev(tglarge) 3.55 +# WER on dev_other(fglarge) 8.63 +# WER on dev_other(tglarge) 9.09 +# WER on test(fglarge) 3.78 +# WER on test(tglarge) 3.94 +# WER on test_other(fglarge) 8.83 +# WER on test_other(tglarge) 9.09 +# Final train prob -0.0452 +# Final valid prob -0.0477 +# Final train prob (xent) -0.7874 +# Final valid prob (xent) -0.8150 +# Num-parameters 27790288 +# exp/chain_cleaned/tdnn_lstm1a_sp/: num-iters=1303 nj=3..16 num-params=27.8M dim=40+100->6056 combine=-0.041->-0.040 (over 9) xent:train/valid[867,1302,final]=(-1.15,-0.782,-0.787/-1.18,-0.810,-0.815) logprob:train/valid[867,1302,final]=(-0.063,-0.047,-0.045/-0.062,-0.049,-0.048) + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +affix=1a +decode_iter= +decode_nj=50 + +# LSTM training options +frames_per_chunk=140,100,160 +frames_per_chunk_primary=$(echo $frames_per_chunk | cut -d, -f1) +chunk_left_context=40 +chunk_right_context=0 +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 +# decode options +extra_left_context=50 +extra_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' + +remove_egs=false +common_egs_dir= +nnet3_affix=_cleaned +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 $opts dim=1280 + linear-component name=tdnn2l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-layer name=tdnn2 $opts input=Append(0,1) dim=1280 + linear-component name=tdnn3l dim=256 $linear_opts + relu-batchnorm-layer name=tdnn3 $opts dim=1280 + linear-component name=tdnn4l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-layer name=tdnn4 $opts input=Append(0,1) dim=1280 + linear-component name=tdnn5l dim=256 $linear_opts + relu-batchnorm-layer name=tdnn5 $opts dim=1280 input=Append(tdnn5l, tdnn3l) + linear-component name=tdnn6l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn6 $opts input=Append(0,3) dim=1280 + linear-component name=lstm1l dim=256 $linear_opts input=Append(-3,0) + fast-lstmp-layer name=lstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=128 delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn7 $opts input=Append(0,3,tdnn6l,tdnn4l,tdnn2l) dim=1280 + linear-component name=tdnn8l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn8 $opts input=Append(0,3) dim=1280 + linear-component name=lstm2l dim=256 $linear_opts input=Append(-3,0) + fast-lstmp-layer name=lstm2 cell-dim=1280 recurrent-projection-dim=256 non-recurrent-projection-dim=128 delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn9 $opts input=Append(0,3,tdnn8l,tdnn6l,tdnn4l) dim=1280 + linear-component name=tdnn10l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn10 $opts input=Append(0,3) dim=1280 + linear-component name=lstm3l dim=256 $linear_opts input=Append(-3,0) + fast-lstmp-layer name=lstm3 cell-dim=1280 recurrent-projection-dim=256 non-recurrent-projection-dim=128 
delay=-3 dropout-proportion=0.0 $lstm_opts + + output-layer name=output input=lstm3 include-log-softmax=false $output_opts + + output-layer name=output-xent input=lstm3 learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/c0{1,2,5,7}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 6 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_chunk \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + + +graph_dir=$dir/graph_tgsmall +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 --remove-oov data/lang_test_tgsmall $dir $graph_dir + # remove from the graph, and convert back to const-FST. + fstrmsymbols --apply-to-output=true --remove-arcs=true "echo 3|" $graph_dir/HCLG.fst - | \ + fstconvert --fst_type=const > $graph_dir/temp.fst + mv $graph_dir/temp.fst $graph_dir/HCLG.fst +fi + + +iter_opts= +if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " +fi +if [ $stage -le 15 ]; then + rm $dir/.error 2>/dev/null || true + for decode_set in test_clean test_other dev_clean dev_other; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $decode_nj --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_tgsmall || exit 1 + steps/lmrescore.sh --cmd "$decode_cmd" --self-loop-scale 1.0 data/lang_test_{tgsmall,tgmed} \ + data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_{tgsmall,tgmed} || exit 1 + steps/lmrescore_const_arpa.sh \ + --cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \ + data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_{tgsmall,tglarge} || exit 1 + steps/lmrescore_const_arpa.sh \ + --cmd "$decode_cmd" data/lang_test_{tgsmall,fglarge} \ + data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_{tgsmall,fglarge} || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi diff --git a/egs/librispeech/s5/local/chain/tuning/run_tdnn_lstm_1b.sh b/egs/librispeech/s5/local/chain/tuning/run_tdnn_lstm_1b.sh new file mode 100755 index 00000000000..d9f20fae011 --- /dev/null +++ b/egs/librispeech/s5/local/chain/tuning/run_tdnn_lstm_1b.sh @@ -0,0 +1,257 @@ +#!/bin/bash +# this is the tdnn-lstmp based on the run_tdnn_lstm_1a.sh under Librispeech but with larger model size. 
+ +# training acoustic model and decoding: +# local/chain/tuning/run_tdnn_lstm_1b.sh +# local/chain/compare_wer.sh exp/chain_cleaned/tdnn_lstm1a_sp exp/chain_cleaned/tdnn_lstm1b_sp +# System tdnn_lstm1a_sp tdnn_lstm1b_sp +# WER on dev(fglarge) 3.44 3.36 +# WER on dev(tglarge) 3.55 3.48 +# WER on dev(tgmed) 4.41 4.26 +# WER on dev(tgsmall) 4.82 4.71 +# WER on dev_other(fglarge) 8.63 8.43 +# WER on dev_other(tglarge) 9.09 8.94 +# WER on dev_other(tgmed) 10.99 10.65 +# WER on dev_other(tgsmall) 11.95 11.51 +# WER on test(fglarge) 3.78 3.83 +# WER on test(tglarge) 3.94 3.93 +# WER on test(tgmed) 4.68 4.72 +# WER on test(tgsmall) 5.11 5.10 +# WER on test_other(fglarge) 8.83 8.69 +# WER on test_other(tglarge) 9.09 9.10 +# WER on test_other(tgmed) 11.05 10.86 +# WER on test_other(tgsmall) 12.18 11.83 +# Final train prob -0.0452 -0.0417 +# Final valid prob -0.0477 -0.0459 +# Final train prob (xent) -0.7874 -0.7488 +# Final valid prob (xent) -0.8150 -0.7757 +# Num-parameters 27790288 45245520 + +# rnn-lm rescoring: +# local/rnnlm/tuning/run_tdnn_lstm_1a.sh --ac-model-dir exp/chain_cleaned/tdnn_lstm1b_sp/ +# System tdnn_lstm1b_sp +# WER on dev(fglarge_nbe_rnnlm) 2.73 +# WER on dev(fglarge_lat_rnnlm) 2.83 +# WER on dev(fglarge) 3.36 +# WER on dev(tglarge) 3.48 +# WER on dev_other(fglarge_nbe_rnnlm) 7.20 +# WER on dev_other(fglarge_lat_rnnlm) 7.23 +# WER on dev_other(fglarge) 8.43 +# WER on dev_other(tglarge) 8.94 +# WER on test(fglarge_nbe_rnnlm) 3.10 +# WER on test(fglarge_lat_rnnlm) 3.22 +# WER on test(fglarge) 3.83 +# WER on test(tglarge) 3.93 +# WER on test_other(fglarge_nbe_rnnlm) 7.54 +# WER on test_other(fglarge_lat_rnnlm) 7.65 +# WER on test_other(fglarge) 8.69 +# WER on test_other(tglarge) 9.10 +# Final train prob -0.0417 +# Final valid prob -0.0459 +# Final train prob (xent) -0.7488 +# Final valid prob (xent) -0.7757 +# Num-parameters 45245520 + + + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +affix=1b +decode_iter= +decode_nj=50 + +# LSTM training options +frames_per_chunk=140,100,160 +frames_per_chunk_primary=$(echo $frames_per_chunk | cut -d, -f1) +chunk_left_context=40 +chunk_right_context=0 +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 +# decode options +extra_left_context=50 +extra_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' + +remove_egs=false +common_egs_dir= +nnet3_affix=_cleaned +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 $opts dim=1280 + linear-component name=tdnn2l dim=320 $linear_opts input=Append(-1,0) + relu-batchnorm-layer name=tdnn2 $opts input=Append(0,1) dim=1280 + linear-component name=tdnn3l dim=320 $linear_opts + relu-batchnorm-layer name=tdnn3 $opts dim=1280 + linear-component name=tdnn4l dim=320 $linear_opts input=Append(-1,0) + relu-batchnorm-layer name=tdnn4 $opts input=Append(0,1) dim=1280 + linear-component name=tdnn5l dim=320 $linear_opts + relu-batchnorm-layer name=tdnn5 $opts dim=1280 input=Append(tdnn5l, tdnn3l) + linear-component name=tdnn6l dim=320 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn6 $opts input=Append(0,3) dim=1280 + linear-component name=lstm1l dim=320 $linear_opts input=Append(-3,0) + fast-lstmp-layer name=lstm1 cell-dim=1536 recurrent-projection-dim=384 non-recurrent-projection-dim=384 delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn7 $opts input=Append(0,3,tdnn6l,tdnn4l,tdnn2l) dim=1280 + linear-component name=tdnn8l dim=320 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn8 $opts input=Append(0,3) dim=1280 + linear-component name=lstm2l dim=320 $linear_opts input=Append(-3,0) + fast-lstmp-layer name=lstm2 cell-dim=1536 recurrent-projection-dim=384 non-recurrent-projection-dim=384 delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn9 $opts input=Append(0,3,tdnn8l,tdnn6l,tdnn4l) dim=1280 + linear-component name=tdnn10l dim=320 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn10 $opts input=Append(0,3) dim=1280 + linear-component name=lstm3l dim=320 $linear_opts input=Append(-3,0) + fast-lstmp-layer name=lstm3 cell-dim=1536 recurrent-projection-dim=384 non-recurrent-projection-dim=384: delay=-3 dropout-proportion=0.0 $lstm_opts + + output-layer name=output input=lstm3 include-log-softmax=false $output_opts + + output-layer name=output-xent input=lstm3 learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/c0{1,2,5,7}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 6 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_chunk \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + + +graph_dir=$dir/graph_tgsmall +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 --remove-oov data/lang_test_tgsmall $dir $graph_dir + # remove from the graph, and convert back to const-FST. + fstrmsymbols --apply-to-output=true --remove-arcs=true "echo 3|" $graph_dir/HCLG.fst - | \ + fstconvert --fst_type=const > $graph_dir/temp.fst + mv $graph_dir/temp.fst $graph_dir/HCLG.fst +fi + + +iter_opts= +if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " +fi +if [ $stage -le 15 ]; then + rm $dir/.error 2>/dev/null || true + for decode_set in test_clean test_other dev_clean dev_other; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $decode_nj --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_tgsmall || exit 1 + steps/lmrescore.sh --cmd "$decode_cmd" --self-loop-scale 1.0 data/lang_test_{tgsmall,tgmed} \ + data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_{tgsmall,tgmed} || exit 1 + steps/lmrescore_const_arpa.sh \ + --cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \ + data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_{tgsmall,tglarge} || exit 1 + steps/lmrescore_const_arpa.sh \ + --cmd "$decode_cmd" data/lang_test_{tgsmall,fglarge} \ + data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_{tgsmall,fglarge} || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi diff --git a/egs/librispeech/s5/local/lm/python/text_post_process.py b/egs/librispeech/s5/local/lm/python/text_post_process.py index 4ffbbe04b1f..344c1b291bd 100755 --- a/egs/librispeech/s5/local/lm/python/text_post_process.py +++ b/egs/librispeech/s5/local/lm/python/text_post_process.py @@ -21,10 +21,10 @@ def parse_args(): parser.add_argument('--abort-long-sent', type=bool, default=False, help='If True and a sentence longer than "max-sent-len" detected' +\ 'exit with error code 1. If False, just split the long sentences.') - parser.add_argument('--sent-end-marker', type=str, default="DOTDOTDOT") - parser.add_argument("in_text", type=str, help="Input text") - parser.add_argument("out_text", type=str, help="Output text") - parser.add_argument("sent_bounds", type=str, + parser.add_argument('--sent-end-marker', default="DOTDOTDOT") + parser.add_argument("in_text", help="Input text") + parser.add_argument("out_text", help="Output text") + parser.add_argument("sent_bounds", help="A file that will contain a comma separated list of numbers, s.t. 
if" + "i is in this list, then there is a sententence break after token i") return parser.parse_args() @@ -66,7 +66,7 @@ def parse_args(): n_tokens += 1 start_scan = 4 current_line.append('SUN') - for i in xrange(start_scan, len(opl_tokens)): + for i in range(start_scan, len(opl_tokens)): m = re.match("^[A-Z]+\'?[A-Z\']*$", opl_tokens[i]) if m is not None: n_tokens += 1 diff --git a/egs/librispeech/s5/local/lm/python/text_pre_process.py b/egs/librispeech/s5/local/lm/python/text_pre_process.py index 6228079b3a3..b75d0711d13 100755 --- a/egs/librispeech/s5/local/lm/python/text_pre_process.py +++ b/egs/librispeech/s5/local/lm/python/text_pre_process.py @@ -20,13 +20,13 @@ def parse_args(): parser = argparse.ArgumentParser(description="Pre-process a book's text") - parser.add_argument("--in-encoding", type=str, default="utf-8", + parser.add_argument("--in-encoding", default="utf-8", help="Encoding to use when reading the input text") - parser.add_argument("--out-encoding", type=str, default="ascii", + parser.add_argument("--out-encoding", default="ascii", help="Encoding to use when writing the output text") - parser.add_argument('--sent-end-marker', type=str, default="DOTDOTDOT") - parser.add_argument("in_text", type=str, help="Input text") - parser.add_argument("out_text", type=str, help="Output text") + parser.add_argument('--sent-end-marker', default="DOTDOTDOT") + parser.add_argument("in_text", help="Input text") + parser.add_argument("out_text", help="Output text") return parser.parse_args() # http://rosettacode.org/wiki/Roman_numerals/Decode#Python diff --git a/egs/librispeech/s5/local/rnnlm/tuning/run_tdnn_lstm_1a.sh b/egs/librispeech/s5/local/rnnlm/tuning/run_tdnn_lstm_1a.sh index 257e497017b..137a972f3d9 100755 --- a/egs/librispeech/s5/local/rnnlm/tuning/run_tdnn_lstm_1a.sh +++ b/egs/librispeech/s5/local/rnnlm/tuning/run_tdnn_lstm_1a.sh @@ -3,23 +3,23 @@ # Copyright 2012 Johns Hopkins University (author: Daniel Povey) # 2018 Ke Li -# This script trains LMs on the librispeech 960 hours training data. +# This script trains LMs on the librispeech-lm-norm.txt.gz. -# rnnlm/train_rnnlm.sh: best iteration (out of 26) was 21, linking it to final iteration. -# rnnlm/train_rnnlm.sh: train/dev perplexity was 118.4 / 152.6. -# Train objf: -5.74 -5.51 -5.38 -5.29 -5.22 -5.16 -5.12 -5.08 -5.05 -5.02 -4.99 -4.97 -4.97 -4.93 -4.90 -4.87 -4.84 -4.82 -4.79 -4.77 -4.75 -4.73 -4.71 -4.69 -4.67 -# Dev objf: -6.00 -5.61 -5.45 -5.36 -5.29 -5.24 -5.20 -5.18 -5.16 -5.13 -5.12 -5.11 -5.11 -5.09 -5.07 -5.06 -5.05 -5.04 -5.03 -5.03 -5.03 -5.03 -5.03 -5.03 -5.03 -5.03 +# rnnlm/train_rnnlm.sh: best iteration (out of 143) was 142, linking it to final iteration. +# rnnlm/train_rnnlm.sh: train/dev perplexity was 109.2 / 110.7. 
+# Train objf: -5.74 -5.54 -5.44 -5.37 -5.32 -5.28 -5.25 -5.23 -5.20 -5.18 -5.15 -5.14 -5.12 -5.10 -5.09 -5.08 -5.07 -5.05 -5.04 -5.04 -5.03 -5.02 -5.01 -5.00 -4.99 -4.99 -4.98 -4.97 -4.96 -4.96 -4.95 -4.95 -4.94 -4.93 -4.93 -4.92 -4.92 -4.92 -4.91 -4.90 -4.90 -4.89 -4.89 -4.89 -4.88 -4.88 -4.87 -4.87 -4.87 -4.86 -4.86 -4.86 -4.85 -4.85 -4.84 -4.84 -4.84 -4.84 -4.84 -4.83 -4.83 -4.83 -4.82 -4.82 -4.82 -4.82 -4.81 -4.81 -4.81 -4.81 -4.80 -4.80 -4.80 -4.79 -4.79 -4.79 -4.79 -4.78 -4.79 -4.78 -4.78 -4.78 -4.78 -4.77 -4.77 -4.77 -4.77 -4.77 -4.76 -4.76 -4.76 -4.76 -4.76 -4.75 -4.75 -4.75 -4.75 -4.75 -4.74 -4.74 -4.74 -4.74 -4.74 -4.74 -4.73 -4.74 -4.74 -4.73 -4.73 -4.73 -4.73 -4.73 -4.72 -4.73 -4.73 -4.73 -4.72 -4.72 -4.72 -4.72 -4.72 -4.72 -4.72 -4.72 -4.71 -4.71 -4.71 -4.71 -4.71 -4.70 -4.70 -4.70 -4.70 -4.70 -4.69 -4.69 -4.69 -4.69 -4.69 -4.69 -4.68 -4.68 +# Dev objf: -5.99 -5.65 -5.53 -5.44 -5.38 -5.34 -5.30 -5.27 -5.22 -5.20 -5.18 -5.16 -5.14 -5.12 -5.11 -5.10 -5.09 -5.08 -5.07 -5.05 -5.04 -5.04 -5.03 -5.01 -5.00 -4.99 -4.99 -4.98 -4.97 -4.97 0.00 -4.96 -4.95 -4.95 -4.94 -4.93 -4.93 -4.92 -4.92 -4.91 -4.91 -4.90 -4.90 -4.89 -4.89 -4.89 -4.88 -4.88 -4.88 -4.87 -4.87 -4.87 -4.86 -4.86 -4.85 -4.85 -4.87 -4.84 -4.84 -4.84 -4.83 -4.91 -4.83 -4.83 -4.83 -4.82 -4.82 -4.82 -4.82 -4.81 -4.81 -4.81 -4.80 -4.80 -4.80 -4.80 -4.80 -4.79 -4.79 -4.79 -4.79 -4.79 -4.79 -4.78 -4.78 -4.79 -4.78 -4.77 -4.77 -4.77 -4.77 -4.77 -4.77 -4.77 -4.76 -4.76 -4.76 -4.76 -4.76 -4.75 -4.75 -4.75 -4.75 -4.75 -4.75 -4.75 -4.75 -4.75 -4.75 -4.75 -4.75 -4.74 -4.74 -4.74 -4.74 -4.74 -4.74 -4.74 -4.73 -4.74 -4.73 -4.73 -4.73 -4.73 -4.73 -4.73 -4.72 -4.72 -4.72 -4.72 -4.72 -4.72 -4.72 -4.72 -4.71 -4.71 -4.71 -4.71 -4.71 -4.71 -4.71 -4.71 # WER summary on dev and test sets # System tdnn_1d_sp +lattice_rescore +nbest_rescore -# WER on dev(fglarge) 3.34 2.97 2.98 -# WER on dev(tglarge) 3.44 3.02 3.07 -# WER on dev_other(fglarge) 8.70 7.98 8.00 -# WER on dev_other(tglarge) 9.25 8.28 8.35 -# WER on test(fglarge) 3.77 3.41 3.40 -# WER on test(tglarge) 3.85 3.50 3.47 -# WER on test_other(fglarge) 8.91 8.22 8.21 -# WER on test_other(tglarge) 9.31 8.55 8.49 +# WER on dev(fglarge) 3.34 2.71 2.62 +# WER on dev(tglarge) 3.44 2.75 2.66 +# WER on dev_other(fglarge) 8.70 7.37 7.55 +# WER on dev_other(tglarge) 9.25 7.56 7.73 +# WER on test(fglarge) 3.77 3.12 3.06 +# WER on test(tglarge) 3.85 3.18 3.11 +# WER on test_other(fglarge) 8.91 7.63 7.68 +# WER on test_other(tglarge) 9.31 7.83 7.95 # command to get the WERs above: # tdnn_1d_sp @@ -37,7 +37,7 @@ lstm_rpd=256 lstm_nrpd=256 stage=-10 train_stage=-10 -epochs=20 +epochs=4 # variables for lattice rescoring run_lat_rescore=true @@ -54,23 +54,25 @@ pruned_rescore=true . ./cmd.sh . ./utils/parse_options.sh -# test of 960 hours training transcriptions -text=data/train_960/text +text=data/local/lm/librispeech-lm-norm.txt.gz lexicon=data/lang_nosp/words.txt -text_dir=data/rnnlm/text_960_1a +text_dir=data/rnnlm/text mkdir -p $dir/config set -e -for f in $text $lexicon; do +for f in $lexicon; do [ ! -f $f ] && \ echo "$0: expected file $f to exist; search for run.sh in run.sh" && exit 1 done if [ $stage -le 0 ]; then mkdir -p $text_dir + if [ ! -f $text ]; then + wget http://www.openslr.org/resources/11/librispeech-lm-norm.txt.gz -P data/local/lm + fi echo -n >$text_dir/dev.txt - # hold out one in every 50 lines as dev data. 
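# Illustration, not part of the recipe: the awk one-liner in this hunk holds out
# one line in every N as dev data (N changes from 50 to 2000 here) and writes the
# rest to the training text.  A minimal Python equivalent of that split logic:
def split_heldout(lines, n=2000):
    train, dev = [], []
    for i, line in enumerate(lines, start=1):  # awk's NR is 1-based
        (dev if i % n == 0 else train).append(line)
    return train, dev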
- cat $text | cut -d ' ' -f2- | awk -v text_dir=$text_dir '{if(NR%50 == 0) { print >text_dir"/dev.txt"; } else {print;}}' >$text_dir/librispeech.txt + # hold out one in every 2000 lines as dev data. + gunzip -c $text | cut -d ' ' -f2- | awk -v text_dir=$text_dir '{if(NR%2000 == 0) { print >text_dir"/dev.txt"; } else {print;}}' >$text_dir/librispeech.txt fi if [ $stage -le 1 ]; then @@ -119,7 +121,7 @@ if [ $stage -le 2 ]; then fi if [ $stage -le 3 ]; then - rnnlm/train_rnnlm.sh --num-jobs-final 2 \ + rnnlm/train_rnnlm.sh --num-jobs-final 8 \ --stage $train_stage \ --num-epochs $epochs \ --cmd "$train_cmd" $dir diff --git a/egs/madcat_ar/v1/README.txt b/egs/madcat_ar/README.txt similarity index 100% rename from egs/madcat_ar/v1/README.txt rename to egs/madcat_ar/README.txt diff --git a/egs/madcat_ar/v1/local/create_line_image_from_page_image.py b/egs/madcat_ar/v1/local/create_line_image_from_page_image.py index 778555c427e..650a0704d80 100755 --- a/egs/madcat_ar/v1/local/create_line_image_from_page_image.py +++ b/egs/madcat_ar/v1/local/create_line_image_from_page_image.py @@ -13,6 +13,7 @@ be vertically or horizontally aligned). Hence to extract line image from line bounding box, page image is rotated and line image is cropped and saved. """ +from __future__ import division import sys import argparse @@ -87,8 +88,8 @@ def unit_vector(pt0, pt1): (float, float): unit vector """ dis_0_to_1 = sqrt((pt0[0] - pt1[0])**2 + (pt0[1] - pt1[1])**2) - return (pt1[0] - pt0[0]) / dis_0_to_1, \ - (pt1[1] - pt0[1]) / dis_0_to_1 + return (pt1[0] - pt0[0])/ dis_0_to_1, \ + (pt1[1] - pt0[1])/ dis_0_to_1 def orthogonal_vector(vector): @@ -130,7 +131,7 @@ def bounding_area(index, hull): return {'area': len_p * len_o, 'length_parallel': len_p, 'length_orthogonal': len_o, - 'rectangle_center': (min_p + len_p / 2, min_o + len_o / 2), + 'rectangle_center': (min_p + float(len_p)/ 2, min_o + float(len_o)/ 2), 'unit_vector': unit_vector_p, } @@ -143,7 +144,7 @@ def to_xy_coordinates(unit_vector_angle, point): ------ (float, float): converted x,y coordinate of the unit vector. """ - angle_orthogonal = unit_vector_angle + pi / 2 + angle_orthogonal = unit_vector_angle + pi/ 2 return point[0] * cos(unit_vector_angle) + point[1] * cos(angle_orthogonal), \ point[0] * sin(unit_vector_angle) + point[1] * sin(angle_orthogonal) @@ -235,8 +236,8 @@ def get_center(im): ------- (int, int): center of the image """ - center_x = im.size[0] / 2 - center_y = im.size[1] / 2 + center_x = float(im.size[0])/ 2 + center_y = float(im.size[1])/ 2 return int(center_x), int(center_y) @@ -248,9 +249,9 @@ def get_horizontal_angle(unit_vector_angle): (float): updated angle of the unit vector to be in radians. It is only in first or fourth quadrant. 
""" - if unit_vector_angle > pi / 2 and unit_vector_angle <= pi: + if unit_vector_angle > pi/ 2 and unit_vector_angle <= pi: unit_vector_angle = unit_vector_angle - pi - elif unit_vector_angle > -pi and unit_vector_angle < -pi / 2: + elif unit_vector_angle > -pi and unit_vector_angle < -pi/ 2: unit_vector_angle = unit_vector_angle + pi return unit_vector_angle @@ -354,7 +355,7 @@ def dilate_polygon(points, amount_increase): bisect = np.divide(bisect, np.linalg.norm(bisect)) cos_theta = np.dot(next_normal, bisect) - hyp = amount_increase / cos_theta + hyp = float(amount_increase)/ cos_theta new_point = np.around(point + hyp * bisect) new_point = new_point.astype(int) diff --git a/egs/madcat_ar/v1/local/download_data.sh b/egs/madcat_ar/v1/local/download_data.sh deleted file mode 100755 index 7061be49c2a..00000000000 --- a/egs/madcat_ar/v1/local/download_data.sh +++ /dev/null @@ -1,40 +0,0 @@ -#!/bin/bash - -# Copyright 2018 Ashish Arora -# Apache 2.0 - -# This script downloads data splits for MADCAT Arabic dataset. -# It also check if madcat arabic data is present or not. - -download_dir1=/export/corpora/LDC/LDC2012T15/data -download_dir2=/export/corpora/LDC/LDC2013T09/data -download_dir3=/export/corpora/LDC/LDC2013T15/data -train_split_url=http://www.openslr.org/resources/48/madcat.train.raw.lineid -test_split_url=http://www.openslr.org/resources/48/madcat.test.raw.lineid -dev_split_url=http://www.openslr.org/resources/48/madcat.dev.raw.lineid -data_splits=data/download/data_splits - -. ./cmd.sh -. ./path.sh -. ./utils/parse_options.sh || exit 1; - -if [ -d $data_splits ]; then - echo "$0: Not downloading the data splits as it is already there." -else - if [ ! -f $data_splits/madcat.train.raw.lineid ]; then - mkdir -p $data_splits - echo "$0: Downloading the data splits..." - wget -P $data_splits $train_split_url || exit 1; - wget -P $data_splits $test_split_url || exit 1; - wget -P $data_splits $dev_split_url || exit 1; - fi - echo "$0: Done downloading the data splits" -fi - -if [ -d $download_dir1 ]; then - echo "$0: madcat arabic data directory is present." -else - if [ ! -f $download_dir1/madcat/*.madcat.xml ]; then - echo "$0: please download madcat data..." - fi -fi diff --git a/egs/madcat_ar/v1/local/prepare_data.sh b/egs/madcat_ar/v1/local/prepare_data.sh new file mode 100755 index 00000000000..1049db9826d --- /dev/null +++ b/egs/madcat_ar/v1/local/prepare_data.sh @@ -0,0 +1,69 @@ +#!/bin/bash + +# Copyright 2017 Chun Chieh Chang +# 2017 Ashish Arora +# 2017 Hossein Hadian +# Apache 2.0 + +# This script downloads the data splits for MADCAT Arabic dataset and prepares the training +# validation, and test data (i.e text, images.scp, utt2spk and spk2utt) by calling process_data.py. +# It also uses Arabic Gigaword text corpus for language modeling. + +# Eg. local/prepare_data.sh +# Eg. 
text file: LDC0001_000399_NHR_ARB_20070113.0052_11_LDC0001_0z11 +# وهناك تداخل بين الرأسمالية الإسرائيلية +# utt2spk file: LDC0001_000397_NHR_ARB_20070113.0052_11_LDC0001_00z1 LDC0001 +# images.scp file: LDC0001_000397_NHR_ARB_20070113.0052_11_LDC0001_00z1 +# data/local/train/1/NHR_ARB_20070113.0052_11_LDC0001_00z1.png + +download_dir1=/export/corpora/LDC/LDC2012T15/data +download_dir2=/export/corpora/LDC/LDC2013T09/data +download_dir3=/export/corpora/LDC/LDC2013T15/data +train_split_url=http://www.openslr.org/resources/48/madcat.train.raw.lineid +test_split_url=http://www.openslr.org/resources/48/madcat.test.raw.lineid +dev_split_url=http://www.openslr.org/resources/48/madcat.dev.raw.lineid +data_splits=data/download/data_splits +stage=0 +download_dir=data/download +gigacorpus=data/local/gigawordcorpus +gigaword_loc=/export/corpora5/LDC/LDC2011T11 +use_extra_corpus_text=true + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh || exit 1; + +if [ -d $data_splits ]; then + echo "$0: Not downloading the data splits as it is already there." +else + if [ ! -f $data_splits/madcat.train.raw.lineid ]; then + mkdir -p $data_splits + echo "$0: Downloading the data splits..." + wget -P $data_splits $train_split_url || exit 1; + wget -P $data_splits $test_split_url || exit 1; + wget -P $data_splits $dev_split_url || exit 1; + fi + echo "$0: Done downloading the data splits" +fi + +if [ -d $download_dir1 ]; then + echo "$0: madcat arabic data directory is present." +else + if [ ! -f $download_dir1/madcat/*.madcat.xml ]; then + echo "$0: please download madcat data..." + fi +fi + +mkdir -p $download_dir data/local +if $use_extra_corpus_text; then + mkdir -p $gigacorpus + cp -r $gigaword_loc/. $gigacorpus + for newswire in aaw_arb afp_arb ahr_arb asb_arb hyt_arb nhr_arb qds_arb umh_arb xin_arb; do + for file in $gigacorpus/arb_gw_5/data/$newswire/*.gz; do + gzip -d $file + done + for file in $gigacorpus/arb_gw_5/data/$newswire/*; do + sed -e '/^<[^>]*>$/d; s/``/"/g; s/\x27\x27/"/g' $file >> $gigacorpus/arb_gw_5/data/${newswire}_combined.txt + done + done +fi diff --git a/egs/madcat_ar/v1/local/process_data.py b/egs/madcat_ar/v1/local/process_data.py index e476b67cb96..71f7f39d632 100755 --- a/egs/madcat_ar/v1/local/process_data.py +++ b/egs/madcat_ar/v1/local/process_data.py @@ -24,23 +24,23 @@ " data/LDC2013T09 data/LDC2013T15 data/madcat.train.raw.lineid " " data/train data/local/lines ", formatter_class=argparse.ArgumentDefaultsHelpFormatter) -parser.add_argument('database_path1', type=str, +parser.add_argument('database_path1', help='Path to the downloaded (and extracted) madcat data') -parser.add_argument('database_path2', type=str, +parser.add_argument('database_path2', help='Path to the downloaded (and extracted) madcat data') -parser.add_argument('database_path3', type=str, +parser.add_argument('database_path3', help='Path to the downloaded (and extracted) madcat data') -parser.add_argument('data_splits', type=str, +parser.add_argument('data_splits', help='Path to file that contains the train/test/dev split information') -parser.add_argument('out_dir', type=str, +parser.add_argument('out_dir', help='directory location to write output files.') -parser.add_argument('images_scp_path', type=str, +parser.add_argument('images_scp_path', help='Path of input images.scp file(maps line image and location)') -parser.add_argument('writing_condition1', type=str, +parser.add_argument('writing_condition1', help='Path to the downloaded (and extracted) writing conditions file 1') 
-parser.add_argument('writing_condition2', type=str, +parser.add_argument('writing_condition2', help='Path to the downloaded (and extracted) writing conditions file 2') -parser.add_argument('writing_condition3', type=str, +parser.add_argument('writing_condition3', help='Path to the downloaded (and extracted) writing conditions file 3') parser.add_argument("--augment", type=lambda x: (str(x).lower()=='true'), default=False, help="performs image augmentation") @@ -192,25 +192,25 @@ def get_line_image_location(): if args.augment: key = (line_id + '.')[:-1] for i in range(0, 3): - location_id = '_' + line_id + '_scale' + str(i) + location_id = "_{}_scale{}".format(line_id, i) line_image_file_name = base_name + location_id + '.png' location = image_loc_dict[line_image_file_name] image_file_path = os.path.join(location, line_image_file_name) line = text_line_word_dict[key] text = ' '.join(line) base_line_image_file_name = line_image_file_name.split('.png')[0] - utt_id = writer_id + '_' + str(image_num).zfill(6) + '_' + base_line_image_file_name + utt_id = "{}_{}_{}".format(writer_id, str(image_num).zfill(6), base_line_image_file_name) text_fh.write(utt_id + ' ' + text + '\n') utt2spk_fh.write(utt_id + ' ' + writer_id + '\n') image_fh.write(utt_id + ' ' + image_file_path + '\n') image_num += 1 else: - updated_base_name = base_name + '_' + str(line_id).zfill(4) +'.png' + updated_base_name = "{}_{}.png".format(base_name, str(line_id).zfill(4)) location = image_loc_dict[updated_base_name] image_file_path = os.path.join(location, updated_base_name) line = text_line_word_dict[line_id] text = ' '.join(line) - utt_id = writer_id + '_' + str(image_num).zfill(6) + '_' + base_name + '_' + str(line_id).zfill(4) + utt_id = "{}_{}_{}_{}".format(writer_id, str(image_num).zfill(6), base_line_image_file_name, str(line_id).zfill(4)) text_fh.write(utt_id + ' ' + text + '\n') utt2spk_fh.write(utt_id + ' ' + writer_id + '\n') image_fh.write(utt_id + ' ' + image_file_path + '\n') diff --git a/egs/madcat_ar/v1/run_end2end.sh b/egs/madcat_ar/v1/run_end2end.sh index de67e444f39..bb2b4f86db1 100755 --- a/egs/madcat_ar/v1/run_end2end.sh +++ b/egs/madcat_ar/v1/run_end2end.sh @@ -19,6 +19,7 @@ images_scp_dir=data/local overwrite=false subset=false augment=false +use_extra_corpus_text=true . ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. ## This relates to the queue. . ./path.sh @@ -35,9 +36,9 @@ if [ $stage -le 0 ]; then echo "Exiting with status 1 to avoid data corruption" exit 1; fi - echo "$0: Downloading data splits...$(date)" - local/download_data.sh --data_splits $data_splits_dir --download_dir1 $download_dir1 \ - --download_dir2 $download_dir2 --download_dir3 $download_dir3 + local/prepare_data.sh --data_splits $data_splits_dir --download_dir1 $download_dir1 \ + --download_dir2 $download_dir2 --download_dir3 $download_dir3 \ + --use_extra_corpus_text $use_extra_corpus_text for set in test train dev; do data_split_file=$data_splits_dir/madcat.$set.raw.lineid @@ -48,7 +49,7 @@ if [ $stage -le 0 ]; then --data data/local/$set --subset $subset --augment $augment || exit 1 done - echo "$0: Preparing data..." + echo "$0: Processing data..." 
for set in dev train test; do local/process_data.py $download_dir1 $download_dir2 $download_dir3 \ $data_splits_dir/madcat.$set.raw.lineid data/$set $images_scp_dir/$set/images.scp \ diff --git a/egs/madcat_zh/v1/local/create_line_image_from_page_image.py b/egs/madcat_zh/v1/local/create_line_image_from_page_image.py index be0afe6d9fc..22af571fc04 100755 --- a/egs/madcat_zh/v1/local/create_line_image_from_page_image.py +++ b/egs/madcat_zh/v1/local/create_line_image_from_page_image.py @@ -76,8 +76,8 @@ def unit_vector(pt0, pt1): Eg. 0.31622776601683794, 0.9486832980505138 """ dis_0_to_1 = sqrt((pt0[0] - pt1[0])**2 + (pt0[1] - pt1[1])**2) - return (pt1[0] - pt0[0]) / dis_0_to_1, \ - (pt1[1] - pt0[1]) / dis_0_to_1 + return (pt1[0] - pt0[0])/ dis_0_to_1, \ + (pt1[1] - pt0[1])/ dis_0_to_1 def orthogonal_vector(vector): @@ -124,7 +124,7 @@ def bounding_area(index, hull): return {'area': len_p * len_o, 'length_parallel': len_p, 'length_orthogonal': len_o, - 'rectangle_center': (min_p + len_p / 2, min_o + len_o / 2), + 'rectangle_center': (min_p + float(len_p)/ 2, min_o + float(len_o)/ 2), 'unit_vector': unit_vector_p, } @@ -140,7 +140,7 @@ def to_xy_coordinates(unit_vector_angle, point): (float, float): converted x,y coordinate of the unit vector. Eg. 0.680742447866183, 2.1299271629971663 """ - angle_orthogonal = unit_vector_angle + pi / 2 + angle_orthogonal = unit_vector_angle + pi/ 2 return point[0] * cos(unit_vector_angle) + point[1] * cos(angle_orthogonal), \ point[0] * sin(unit_vector_angle) + point[1] * sin(angle_orthogonal) @@ -246,8 +246,8 @@ def get_center(im): (int, int): center of the image Eg. 2550, 3300 """ - center_x = im.size[0] / 2 - center_y = im.size[1] / 2 + center_x = float(im.size[0])/ 2 + center_y = float(im.size[1])/ 2 return int(center_x), int(center_y) @@ -262,9 +262,9 @@ def get_horizontal_angle(unit_vector_angle): Eg. 0.01543. 
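# Illustration, not part of the patch: the float() casts and the
# 'from __future__ import division' imports added in these hunks guard against
# Python 2 truncating integer division, which would shift the computed image
# centers and rectangle centers by up to half a pixel:
print(5 / 2)         # 2.5 under Python 3 (or with the __future__ import); 2 under plain Python 2
print(float(5) / 2)  # 2.5 under both, matching the intent of get_center() above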
""" - if unit_vector_angle > pi / 2 and unit_vector_angle <= pi: + if unit_vector_angle > pi/ 2 and unit_vector_angle <= pi: unit_vector_angle = unit_vector_angle - pi - elif unit_vector_angle > -pi and unit_vector_angle < -pi / 2: + elif unit_vector_angle > -pi and unit_vector_angle < -pi/ 2: unit_vector_angle = unit_vector_angle + pi return unit_vector_angle diff --git a/egs/madcat_zh/v1/local/process_data.py b/egs/madcat_zh/v1/local/process_data.py index dbee815953a..994a4486420 100755 --- a/egs/madcat_zh/v1/local/process_data.py +++ b/egs/madcat_zh/v1/local/process_data.py @@ -23,11 +23,11 @@ " data/LDC2013T09 data/LDC2013T15 data/madcat.train.raw.lineid " " data/train data/local/lines ", formatter_class=argparse.ArgumentDefaultsHelpFormatter) -parser.add_argument('database_path1', type=str, +parser.add_argument('database_path1', help='Path to the downloaded (and extracted) madcat data') -parser.add_argument('data_splits', type=str, +parser.add_argument('data_splits', help='Path to file that contains the train/test/dev split information') -parser.add_argument('out_dir', type=str, +parser.add_argument('out_dir', help='directory location to write output files.') args = parser.parse_args() @@ -185,12 +185,12 @@ def get_line_image_location(): base_name = os.path.basename(image_file_path) base_name, b = base_name.split('.tif') for lineID in sorted(text_line_word_dict): - updated_base_name = base_name + '_' + str(lineID).zfill(4) +'.png' + updated_base_name = "{}_{}.png".format(base_name, str(lineID).zfill(4)) location = image_loc_dict[updated_base_name] image_file_path = os.path.join(location, updated_base_name) line = text_line_word_dict[lineID] text = ' '.join(''.join(line)) - utt_id = writer_id + '_' + str(image_num).zfill(6) + '_' + base_name + '_' + str(lineID).zfill(4) + utt_id = "{}_{}_{}_{}".format(writer_id, str(image_num).zfill(6), base_name, str(lineID).zfill(4)) text_fh.write(utt_id + ' ' + text + '\n') utt2spk_fh.write(utt_id + ' ' + writer_id + '\n') image_fh.write(utt_id + ' ' + image_file_path + '\n') diff --git a/egs/mini_librispeech/s5/local/grammar/extend_vocab_demo.sh b/egs/mini_librispeech/s5/local/grammar/extend_vocab_demo.sh index 382f9f4f6c6..1ec4a0d575b 100755 --- a/egs/mini_librispeech/s5/local/grammar/extend_vocab_demo.sh +++ b/egs/mini_librispeech/s5/local/grammar/extend_vocab_demo.sh @@ -96,7 +96,7 @@ if [ $stage -le 4 ]; then if $run_g2p; then steps/dict/apply_g2p.sh $tree_dir/extvocab_nosp_lexicon/words $tree_dir/extvocab_nosp_g2p $tree_dir/extvocab_nosp_lexicon else - cat <$tree_dir/extvocab_nosp_lexicon//lexicon.lex + cat <$tree_dir/extvocab_nosp_lexicon/lexicon.lex HARDWIGG 0.962436 HH AA1 R D W IH1 G SUDVESTR 0.162048 S AH1 D V EY1 S T R SUDVESTR 0.133349 S AH1 D V EH1 S T R diff --git a/egs/multi_en/s5/local/format_acronyms_ctm_eval2000.py b/egs/multi_en/s5/local/format_acronyms_ctm_eval2000.py index 3c447c5976a..75cc4458d85 100755 --- a/egs/multi_en/s5/local/format_acronyms_ctm_eval2000.py +++ b/egs/multi_en/s5/local/format_acronyms_ctm_eval2000.py @@ -10,6 +10,7 @@ # en_4156 B 414.58 0.16 l # en_4156 B 414.74 0.17 a +from __future__ import division import argparse,re __author__ = 'Minhua Wu' @@ -27,7 +28,7 @@ if items[4].find(".") != -1: letters = items[4].split("._") acronym_period = round(float(items[3]), 2) - letter_slot = round(acronym_period / len(letters), 2) + letter_slot = round(acronym_period/len(letters), 2) time_start = round(float(items[2]), 2) for l in letters[:-1]: time = " %.2f %.2f " % (time_start, letter_slot) diff --git 
a/egs/multi_en/s5/local/format_acronyms_ctm_rt03.py b/egs/multi_en/s5/local/format_acronyms_ctm_rt03.py index 59814beb4ea..8438bbdaf81 100755 --- a/egs/multi_en/s5/local/format_acronyms_ctm_rt03.py +++ b/egs/multi_en/s5/local/format_acronyms_ctm_rt03.py @@ -10,6 +10,7 @@ # en_4156 B 414.58 0.16 l # en_4156 B 414.74 0.17 a +from __future__ import division import argparse,re __author__ = 'Minhua Wu' @@ -27,7 +28,7 @@ if items[4].find(".") != -1: letters = items[4].split("._") acronym_period = round(float(items[3]), 2) - letter_slot = round(acronym_period / len(letters), 2) + letter_slot = round(acronym_period/len(letters), 2) time_start = round(float(items[2]), 2) for l in letters[:-1]: time = " %.2f %.2f " % (time_start, letter_slot) diff --git a/egs/multi_en/s5/local/normalize_transcript.py b/egs/multi_en/s5/local/normalize_transcript.py index 4572f4d658d..c640723a885 100755 --- a/egs/multi_en/s5/local/normalize_transcript.py +++ b/egs/multi_en/s5/local/normalize_transcript.py @@ -7,6 +7,7 @@ # This script normalizes the given "text" (transcript) file. The normalized result # is printed to STDOUT. This normalization should be applied to all corpora. +from __future__ import print_function import re import sys @@ -26,7 +27,7 @@ def normalize(utt): def main(): if len(sys.argv) != 2: - print 'Usage: local/normalize_transcript.py [text_file]' + print('Usage: local/normalize_transcript.py [text_file]') sys.exit(1) with open(sys.argv[1], 'r') as f: for line in f.readlines(): diff --git a/egs/multi_en/s5/local/tedlium_join_suffix.py b/egs/multi_en/s5/local/tedlium_join_suffix.py index c85e8f364f6..47db4ce0b05 100755 --- a/egs/multi_en/s5/local/tedlium_join_suffix.py +++ b/egs/multi_en/s5/local/tedlium_join_suffix.py @@ -12,6 +12,7 @@ # Apache 2.0 +from __future__ import print_function import sys from codecs import open diff --git a/egs/rimes/README.txt b/egs/rimes/README.txt new file mode 100644 index 00000000000..d201c5fec4e --- /dev/null +++ b/egs/rimes/README.txt @@ -0,0 +1,13 @@ +Rimes is a French handwriting recognition database created by A2iA. +The database was created by asking individuals to write letters on a given scenario like +a change of personal information, payment difficulty, damage declaration. The +dataset has been used in several international research including ICFHR 2008, +ICDAR-2009, ICDAR-2011 competitions for isolated word level and +line level recognition tasks. + +It contains 11333 training lines and 788 test lines. It does not include +a validation split but in a recent publication a 10% sampling of the total +training lines for validation purposes were performed +(http://www.jpuigcerver.net/pubs/jpuigcerver_icdar2017.pdf). +We have used a similar train, test and validation split. +More info: http://www.a2ialab.com/doku.php?id=rimes_database:start diff --git a/egs/rimes/v1/cmd.sh b/egs/rimes/v1/cmd.sh new file mode 100755 index 00000000000..6080a8bab68 --- /dev/null +++ b/egs/rimes/v1/cmd.sh @@ -0,0 +1,13 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. 
Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export cmd="retry.pl queue.pl" diff --git a/egs/rimes/v1/image b/egs/rimes/v1/image new file mode 120000 index 00000000000..1668ee99922 --- /dev/null +++ b/egs/rimes/v1/image @@ -0,0 +1 @@ +../../cifar/v1/image/ \ No newline at end of file diff --git a/egs/rimes/v1/local/chain/compare_wer.sh b/egs/rimes/v1/local/chain/compare_wer.sh new file mode 100755 index 00000000000..4a2cc29481c --- /dev/null +++ b/egs/rimes/v1/local/chain/compare_wer.sh @@ -0,0 +1,88 @@ +#!/bin/bash + +# this script is used for comparing decoding results between systems. +# e.g. local/chain/compare_wer.sh exp/chain/cnn{1a,1b} + +# Copyright 2017 Chun Chieh Chang +# 2017 Ashish Arora + +if [ $# == 0 ]; then + echo "Usage: $0: [ ... ]" + echo "e.g.: $0 exp/chain/cnn{1a,1b}" + exit 1 +fi +. ./path.sh + +echo "# $0 $*" +used_epochs=false + +echo -n "# System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +echo -n "# WER " +for x in $*; do + wer=$(cat $x/decode_test/scoring_kaldi/best_wer | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "# CER " +for x in $*; do + cer=$(cat $x/decode_test/scoring_kaldi/best_cer | awk '{print $2}') + printf "% 10s" $cer +done +echo + +echo -n "# WER val " +for x in $*; do + wer=$(cat $x/decode_val/scoring_kaldi/best_wer | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "# CER val " +for x in $*; do + cer=$(cat $x/decode_val/scoring_kaldi/best_cer | awk '{print $2}') + printf "% 10s" $cer +done +echo + +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. 
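# Illustration, not part of the script: the WER/CER loops above take field 2 of
# scoring_kaldi/best_wer, whose single line typically looks like
# "%WER 7.75 [ 437 / 5639, 62 ins, 55 del, 320 sub ] exp/chain/.../wer_7_1.0".
# A rough Python equivalent of the awk '{print $2}' extraction (path is an example):
with open('exp/chain/cnn_e2eali_1a/decode_test/scoring_kaldi/best_wer') as f:
    wer = f.readline().split()[1]  # -> '7.75'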
+fi + +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final train prob (xent) " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob (xent) " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Parameters " +for x in $*; do + params=$(nnet3-info $x/final.mdl 2>/dev/null | grep num-parameters | cut -d' ' -f2 | awk '{printf "%0.2fM\n",$1/1000000}') + printf "% 10s" $params +done +echo diff --git a/egs/rimes/v1/local/chain/run_cnn_e2eali.sh b/egs/rimes/v1/local/chain/run_cnn_e2eali.sh new file mode 120000 index 00000000000..e2545b0186e --- /dev/null +++ b/egs/rimes/v1/local/chain/run_cnn_e2eali.sh @@ -0,0 +1 @@ +tuning/run_cnn_e2eali_1a.sh \ No newline at end of file diff --git a/egs/rimes/v1/local/chain/run_e2e_cnn.sh b/egs/rimes/v1/local/chain/run_e2e_cnn.sh new file mode 120000 index 00000000000..d26ba0182ce --- /dev/null +++ b/egs/rimes/v1/local/chain/run_e2e_cnn.sh @@ -0,0 +1 @@ +tuning/run_e2e_cnn_1a.sh \ No newline at end of file diff --git a/egs/rimes/v1/local/chain/tuning/run_cnn_e2eali_1a.sh b/egs/rimes/v1/local/chain/tuning/run_cnn_e2eali_1a.sh new file mode 100755 index 00000000000..4eb3e5e1e76 --- /dev/null +++ b/egs/rimes/v1/local/chain/tuning/run_cnn_e2eali_1a.sh @@ -0,0 +1,257 @@ +#!/bin/bash + +# e2eali_1a is a 6 cnn layer 3 tdnn layer model with dropout, l2-regularization, batch-normalization + +# local/chain/compare_wer.sh exp/chain/cnn_e2eali_1a +# System cnn_e2eali_1a +# WER 7.75 +# CER 2.68 +# Final train prob -0.0779 +# Final valid prob -0.0860 +# Final train prob (xent) -0.7744 +# Final valid prob (xent) -0.8111 +# Parameters 4.96M + +# steps/info/chain_dir_info.pl exp/chain/cnn_e2eali_1a +# exp/chain/cnn_e2eali_1a: num-iters=36 nj=3..8 num-params=5.0M dim=40->944 combine=-0.076->-0.076 (over 1) xent:train/valid[23,35,final]=(-1.48,-0.871,-0.774/-1.46,-0.888,-0.811) logprob:train/valid[23,35,final]=(-0.208,-0.102,-0.078/-0.189,-0.104,-0.086) + +# line level scoring result +# WER 7.75 [ 437 / 5639, 62 ins, 55 del, 320 sub ] exp/chain/cnn_e2eali_1d/decode_test/wer_7_1.0 +# paragraph scoring result +# WER 6.69 [ 377 / 5639, 44 ins, 37 del, 296 sub ] exp/chain/cnn_e2eali_1a/decode_test/para/wer_7_1.0 + +set -e -o pipefail + +stage=0 + +nj=50 +train_set=train +decode_val=true +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. +affix=_1a #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +e2echain_model_dir=exp/chain/e2e_cnn_1a +tree_affix=_1a +bnf_chain_model_dir=exp/chain/e2e_cnn_1a +bnf_layer_name=tdnn6.affine +common_egs_dir= +reporting_email= + +# chain options +train_stage=-10 +xent_regularize=0.1 +# training chunk-options +chunk_width=340,300,200,100 +num_leaves=1000 +# we don't need extra left/right context for TDNN systems. +tdnn_dim=550 +# training options +srand=0 +remove_egs=true +lang_decode=data/lang +if $decode_val; then maybe_val=val; else maybe_val= ; fi +dropout_schedule='0,0@0.20,0.2@0.50,0' +# End configuration section. 
+echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ + --acoustic-scale 1.0 --generate-ali-from-lats true \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + ${train_data_dir} data/lang $e2echain_model_dir $lat_dir + echo "" >$lat_dir/splice_opts +fi + +bnf_data_dir=$bnf_chain_model_dir/$(basename $train_data_dir) +if [ $stage -le 3 ]; then + if [ -f $bnf_data_dir/feats.scp ]; then + echo "$0: $bnf_data_dir/feats.scp exists. Refusing to dump features!" + exit 1 + fi + + steps/nnet3/make_bottleneck_features.sh --cmd "$cmd" --use-gpu true \ + --compress false --nj $nj \ + $bnf_layer_name ${train_data_dir} ${bnf_data_dir} $bnf_chain_model_dir || exit 1 +fi + +if [ $stage -le 4 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 4 \ + --alignment-subsampling-factor 1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$cmd" $num_leaves ${bnf_data_dir} \ + $lang $lat_dir $tree_dir +fi + + +if [ $stage -le 5 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + cnn_opts="l2-regularize=0.03 dropout-proportion=0.0" + tdnn_opts="l2-regularize=0.03" + output_opts="l2-regularize=0.04" + common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + conv-relu-batchnorm-dropout-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-dropout-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-dropout-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-dropout-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-dropout-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common3 height-subsample-out=2 + conv-relu-batchnorm-dropout-layer name=cnn6 height-in=10 height-out=10 time-offsets=-4,0,4 $common3 + relu-batchnorm-dropout-layer name=tdnn1 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + relu-batchnorm-dropout-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + relu-batchnorm-dropout-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output 
include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 6 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.apply-deriv-weights=true \ + --chain.lm-opts="--ngram-order=2 --no-prune-ngram-order=1 --num-extra-lm-states=1000" \ + --chain.frame-subsampling-factor=4 \ + --chain.alignment-subsampling-factor=1 \ + --chain.left-tolerance 3 \ + --chain.right-tolerance 3 \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=10 \ + --trainer.frames-per-iter=2000000 \ + --trainer.optimization.num-jobs-initial=3 \ + --trainer.optimization.num-jobs-final=8 \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=32,16 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0 --constrained false" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 7 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/mkgraph.sh \ + --self-loop-scale 1.0 $lang_decode \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 8 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + for decode_set in test $maybe_val; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/$decode_set $dir/decode_$decode_set || exit 1; + done +fi + +echo "Done. Date: $(date). 
Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/rimes/v1/local/chain/tuning/run_e2e_cnn_1a.sh b/egs/rimes/v1/local/chain/tuning/run_e2e_cnn_1a.sh new file mode 100755 index 00000000000..9d28a41316d --- /dev/null +++ b/egs/rimes/v1/local/chain/tuning/run_e2e_cnn_1a.sh @@ -0,0 +1,156 @@ +#!/bin/bash +# Copyright 2017 Hossein Hadian + +# This script does end2end chain training (i.e. from scratch) +# local/chain/compare_wer.sh exp/chain/e2e_cnn_1a +# System e2e_cnn_1d +# WER 10.07 +# CER 3.95 +# Final train prob 0.0369 +# Final valid prob -0.0129 +# Final train prob (xent) +# Final valid prob (xent) +# Parameters 12.73M + +# steps/info/chain_dir_info.pl exp/chain/e2e_cnn_1a +# exp/chain/e2e_cnn_1a: num-iters=20 nj=2..4 num-params=12.7M dim=40->19404 combine=0.079->0.079 (over 3) logprob:train/valid[12,19,final]=(0.017,0.034,0.037/-0.024,-0.013,-0.013) + +set -e + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +affix=1a +nj=50 + +# training options +tdnn_dim=450 +minibatch_size=150=100,64/300=50,32/600=25,16/1200=16,8 +common_egs_dir= +train_set=train +decode_val=true +lang_decode=data/lang +if $decode_val; then maybe_val=val; else maybe_val= ; fi +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 1 ]; then + steps/nnet3/chain/e2e/prepare_e2e.sh --nj 30 --cmd "$cmd" \ + --shared-phones true \ + --type biphone \ + data/$train_set $lang $treedir + $cmd $treedir/log/make_phone_lm.log \ + cat data/$train_set/text \| \ + steps/nnet3/chain/e2e/text_to_phones.py data/lang \| \ + utils/sym2int.pl -f 2- data/lang/phones.txt \| \ + chain-est-phone-lm --num-extra-lm-states=500 \ + ark:- $treedir/phone_lm.fst +fi + +if [ $stage -le 2 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}') + common1="height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="height-offsets=-2,-1,0,1,2 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn4 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn6 input=Append(-4,0,4) dim=200 + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $output_opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs +fi + +if [ $stage -le 3 ]; then + # no need to store the egs in a shared storage because we always + # remove them. Anyway, it takes only 5 minutes to generate them. 
+ steps/nnet3/chain/e2e/train_e2e.py --stage $train_stage \ + --cmd "$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--num_egs_diagnostic 100 --num_utts_subset 400" \ + --chain.frame-subsampling-factor 4 \ + --chain.alignment-subsampling-factor 4 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 2000000 \ + --trainer.num-epochs 3 \ + --trainer.optimization.momentum 0 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 4 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.shrink-value 1.0 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs true \ + --feat-dir data/${train_set} \ + --tree-dir $treedir \ + --dir $dir || exit 1; +fi + +if [ $stage -le 4 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + utils/mkgraph.sh \ + --self-loop-scale 1.0 $lang_decode \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 5 ]; then + for decode_set in test $maybe_val; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/$decode_set $dir/decode_$decode_set || exit 1; + done +fi + +echo "Done. Date: $(date). Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/rimes/v1/local/combine_line_txt_to_paragraph.py b/egs/rimes/v1/local/combine_line_txt_to_paragraph.py new file mode 100755 index 00000000000..5a794506b47 --- /dev/null +++ b/egs/rimes/v1/local/combine_line_txt_to_paragraph.py @@ -0,0 +1,41 @@ +#!/usr/bin/env python3 + +""" This script creates paragraph level text file. It reads + the line level text file and combines them to get + paragraph level file. + Eg. local/combine_line_txt_to_paragraph.py + Eg. 
Input: writer000000_eval2011-0_000001 Comme indiqué dans + writer000000_eval2011-0_000002 habitation n° DVT 36 + writer000000_eval2011-0_000003 de mon domicile + Output: writer000000_eval2011-0 Comme indiqué dans habitation n° DVT 36 de mon domicile +""" + +import argparse +import os +import io +import sys +### main ### +infile = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8') +output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') + +paragraph_txt_dict = dict() +for line in infile: + line_vect = line.strip().split(' ') + line_id = int(line_vect[0].split('_')[-1]) + paragraph_id = line_vect[0].split('-')[-1] + paragraph_id = int(paragraph_id.split('_')[0]) + line_text = " ".join(line_vect[1:]) + if paragraph_id not in paragraph_txt_dict.keys(): + paragraph_txt_dict[paragraph_id] = dict() + paragraph_txt_dict[paragraph_id][line_id] = line_text + + +para_txt_dict = dict() +for para_id in sorted(paragraph_txt_dict.keys()): + para_txt = "" + for line_id in sorted(paragraph_txt_dict[para_id]): + text = paragraph_txt_dict[para_id][line_id] + para_txt = para_txt + " " + text + para_txt_dict[para_id] = para_txt + utt_id = 'writer' + str(para_id).zfill(6) + '_' + 'eval2011-' + str(para_id) + output.write(utt_id + ' ' + para_txt + '\n') diff --git a/egs/rimes/v1/local/extract_features.sh b/egs/rimes/v1/local/extract_features.sh new file mode 100755 index 00000000000..ec3bc8a268c --- /dev/null +++ b/egs/rimes/v1/local/extract_features.sh @@ -0,0 +1,46 @@ +#!/bin/bash + +# Copyright 2017 Yiwen Shao +# 2018 Ashish Arora + +# Apache 2.0 +# This script runs the make features script in parallel. + +nj=4 +cmd=run.pl +feat_dim=40 +augment_type=no_aug +echo "$0 $@" + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh || exit 1; + +data=$1 +featdir=$data/data +scp=$data/images.scp +logdir=$data/log + +mkdir -p $logdir +mkdir -p $featdir + +# make $featdir an absolute pathname +featdir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $featdir ${PWD}` + +for n in $(seq $nj); do + split_scps="$split_scps $logdir/images.$n.scp" +done + +# split images.scp +utils/split_scp.pl $scp $split_scps || exit 1; + +$cmd JOB=1:$nj $logdir/extract_features.JOB.log \ + image/ocr/make_features.py $logdir/images.JOB.scp \ + --allowed_len_file_path $data/allowed_lengths.txt \| \ + copy-feats --compress=true --compression-method=7 \ + ark:- ark,scp:$featdir/images.JOB.ark,$featdir/images.JOB.scp + +## aggregates the output scp's to get feats.scp +for n in $(seq $nj); do + cat $featdir/images.$n.scp || exit 1; +done > $data/feats.scp || exit 1 diff --git a/egs/rimes/v1/local/prepare_data.sh b/egs/rimes/v1/local/prepare_data.sh new file mode 100755 index 00000000000..502718e7777 --- /dev/null +++ b/egs/rimes/v1/local/prepare_data.sh @@ -0,0 +1,67 @@ +#!/bin/bash + +# This script creates traing and validations splits, downloads text corpus for language modeling, +# prepares the training, validation and test data for rimes dataset +# (i.e text, images.scp, utt2spk and spk2utt). It calls process_data.py. + +# Eg. local/prepare_data.sh +# Eg. 
text file: writer000150_train2011-150_000001 J'ai perdu mon emploi depuis 3 mois et je me +# utt2spk file: writer000150_train2011-150_000001 writer000150 +# images.scp file: writer000150_train2011-150_000001 data/local/rimes_data/line_image/train/train2011-150_000001.png + +stage=0 +download_dir=data/local/rimes_data +data_dir=data/local/rimes_data +page_image=$data_dir/page_image +xml=$data_dir/xml +train_img_url="http://www.a2ialab.com/lib/exe/fetch.php?media=rimes_database:data:icdar2011:line:training_2011.tar"; +train_xml_url="http://www.a2ialab.com/lib/exe/fetch.php?media=rimes_database:data:icdar2011:line:training_2011.xml"; +test_xml_url="http://www.a2ialab.com/lib/exe/fetch.php?media=rimes_database:data:icdar2011:line:eval_2011_annotated.xml"; +test_img_url="http://www.a2ialab.com/lib/exe/fetch.php?media=rimes_database:data:icdar2011:line:eval_2011.tar"; +text_url="http://opus.nlpl.eu/download.php?f=OfisPublik.tar.gz" +use_extra_corpus_text=true +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh || exit 1; + +mkdir -p data/{train,test,val} + +if [ -d $page_image ]; then + echo "$0: Not downloading data as it is already there." +else + mkdir -p $data_dir/{page_image,xml,line_image}/{train_total,test,val,train} + tar -xf $download_dir/training_2011.tar -C $page_image/train_total || exit 1; + tar -xf $download_dir/eval_2011.tar -C $page_image/test || exit 1; + cp -r $download_dir/training_2011.xml $xml/train_total/rimes_2011.xml + cp -r $download_dir/eval_2011_annotated.xml $xml/test/rimes_2011.xml + echo "$0: Done downloading and extracting data" + + #First 150 training page images are used for validation + cat $xml/train_total/rimes_2011.xml | head -n451 > $xml/val/rimes_2011.xml + cat $xml/train_total/rimes_2011.xml | tail -1 >> $xml/val/rimes_2011.xml + cp -r $page_image/train_total/* $page_image/train + + #Remaining training page images are used for training + cat $xml/train_total/rimes_2011.xml | head -1 > $xml/train/rimes_2011.xml + cat $xml/train_total/rimes_2011.xml | tail -n+452 >> $xml/train/rimes_2011.xml + cp -r $page_image/train_total/* $page_image/val +fi + +if $use_extra_corpus_text; then + # using freely available french text corpus for language modeling + mkdir -p data/local/text_data + wget -P data/local/text_data $text_url || exit 1; + tar -xf data/local/text_data/download.php?f=OfisPublik.tar.gz -C data/local/text_data || exit 1; + zcat data/local/text_data/OfisPublik/raw/fr/*.gz > data/local/text_data/fr_text +fi + +if [ $stage -le 0 ]; then + echo "$0: Processing train, val and test data... $(date)." + local/process_data.py $data_dir train --augment true || exit 1 + local/process_data.py $data_dir val || exit 1 + local/process_data.py $data_dir test || exit 1 + for dataset in test train val; do + echo "$0: Fixing data directory for dataset: $dataset $(date)." + image/fix_data_dir.sh data/$dataset + done +fi diff --git a/egs/rimes/v1/local/prepare_dict.sh b/egs/rimes/v1/local/prepare_dict.sh new file mode 100755 index 00000000000..d8093658c30 --- /dev/null +++ b/egs/rimes/v1/local/prepare_dict.sh @@ -0,0 +1,23 @@ +#!/usr/bin/env bash + +# Copyright 2017 Hossein Hadian +# 2017 Chun Chieh Chang +# 2017 Ashish Arora + +# This script prepares the dictionary. 
+ +set -e +dir=data/local/dict +mkdir -p $dir + +local/prepare_lexicon.py $dir + +cut -d' ' -f2- $dir/lexicon.txt | sed 's/SIL//g' | tr ' ' '\n' | sort -u | sed '/^$/d' >$dir/nonsilence_phones.txt || exit 1; + +echo ' SIL' >> $dir/lexicon.txt + +echo SIL > $dir/silence_phones.txt + +echo SIL >$dir/optional_silence.txt + +echo -n "" >$dir/extra_questions.txt diff --git a/egs/rimes/v1/local/prepare_lexicon.py b/egs/rimes/v1/local/prepare_lexicon.py new file mode 100755 index 00000000000..5a6ac5b6dbf --- /dev/null +++ b/egs/rimes/v1/local/prepare_lexicon.py @@ -0,0 +1,30 @@ +#!/usr/bin/env python3 + +# Copyright 2018 Ashish Arora + +import argparse +import os + +parser = argparse.ArgumentParser(description="""Creates the list of characters and words in lexicon""") +parser.add_argument('dir', type=str, help='output path') +args = parser.parse_args() + +### main ### +lex = {} +text_path = os.path.join('data', 'train', 'text') +text_fh = open(text_path, 'r', encoding='utf-8') + +with open(text_path, 'r', encoding='utf-8') as f: + for line in f: + line_vect = line.strip().split(' ') + for i in range(1, len(line_vect)): + characters = list(line_vect[i]) + # Put SIL instead of "|". Because every "|" in the beginning of the words is for initial-space of that word + characters = " ".join(['SIL' if char == '|' else char for char in characters]) + lex[line_vect[i]] = characters + if line_vect[i] == '#': + lex[line_vect[i]] = "" + +with open(os.path.join(args.dir, 'lexicon.txt'), 'w', encoding='utf-8') as fp: + for key in sorted(lex): + fp.write(key + " " + lex[key] + "\n") diff --git a/egs/rimes/v1/local/process_data.py b/egs/rimes/v1/local/process_data.py new file mode 100755 index 00000000000..b87d9fbc5e2 --- /dev/null +++ b/egs/rimes/v1/local/process_data.py @@ -0,0 +1,111 @@ +#!/usr/bin/env python3 + +""" This script reads xml file and creates the following files :text, utt2spk, images.scp. + It also creates line images from page image and stores it into + data/local/rimes_data/train/lines. + Eg. local/process_data.py data/local/rimes_data/train train + Eg. text file: writer000000_train2011-0_000001 Je vous adresse ce courrier afin + utt2spk file: writer000000_train2011-0_000001 writer000000 + images.scp file: writer000000_train2011-0_000001 \ + data/local/rimes_data/train/lines/train2011-0_000001.png +""" + +import argparse +import xml.dom.minidom as minidom +from PIL import Image +import os +import random +parser = argparse.ArgumentParser(description="""Creates line images from page image.""") +parser.add_argument('database_path', type=str, + help='Path to the downloaded (and extracted) mdacat data') +parser.add_argument('dataset', type=str, + help='Subset of data to process.') +parser.add_argument("--augment", type=lambda x: (str(x).lower()=='true'), default=False, + help="performs image augmentation") +parser.add_argument('--pixel-scaling', type=int, default=20, + help='padding across horizontal/verticle direction') +args = parser.parse_args() + +def expand_aabb(left, right, top, bottom, delta_pixel): + """ Increases size of axis aligned bounding box (aabb). + """ + left = left - delta_pixel + right = right + delta_pixel + top = top - delta_pixel + bottom = bottom + delta_pixel + return left, right, top, bottom + +def get_line_images_from_page_image(file_name, left, right, top, bottom, line_id): + """ Given a page image, extracts the line images from it. + Input + ----- + file_name (string): name of the page image. + left, right, top, bottom (int): coordinates corresponding to the line image. 
+ line_id (int): line number on the page image. + """ + page_image_path = os.path.join(page_image_folder, file_name) + im = Image.open(page_image_path) + box = (left, top, right, bottom) + region = im.crop(box) + base_name = os.path.splitext(os.path.basename(file_name))[0] + line_image_file_name = base_name + '_' + str(line_id).zfill(6) + '.png' + imgray = region.convert('L') + line_image_path = os.path.join(args.database_path, 'line_image', args.dataset, line_image_file_name) + imgray.save(line_image_path) + return base_name, line_image_path + +def write_kaldi_process_data_files(base_name, line_id, text): + """creates files requires for dictionary and feats.scp. + Input + ----- + image_path (string): name of the page image. + line_id (str): line number on the page image. + text: transcription of the line image. + base_name (string): + """ + writer_id = str(base_name.split('-')[1]) + writer_id = str(writer_id).zfill(6) + writer_id = 'writer' + writer_id + utt_id = writer_id + '_' + base_name + '_' + str(line_id).zfill(6) + line_image_file_name = base_name + '_' + str(line_id).zfill(6) + '.png' + image_path = os.path.join(args.database_path, 'line_image', args.dataset, line_image_file_name) + text_fh.write(utt_id + ' ' + text + '\n') + utt2spk_fh.write(utt_id + ' ' + writer_id + '\n') + image_fh.write(utt_id + ' ' + image_path + '\n') + +### main ### +text_file = os.path.join('data', args.dataset, 'text') +text_fh = open(text_file, 'w', encoding='utf-8') +utt2spk_file = os.path.join('data', args.dataset, 'utt2spk') +utt2spk_fh = open(utt2spk_file, 'w', encoding='utf-8') +image_file = os.path.join('data', args.dataset, 'images.scp') +image_fh = open(image_file, 'w', encoding='utf-8') + +xml_path = os.path.join(args.database_path, 'xml', args.dataset) + '/rimes_2011.xml' +page_image_folder = os.path.join(args.database_path, 'page_image', args.dataset) +doc = minidom.parse(xml_path) +single_page = doc.getElementsByTagName('SinglePage') +for page in single_page: + file_name = page.getAttribute('FileName') + line = page.getElementsByTagName('Line') + id = 0 + for node in line: + id += 1 + bottom = int(node.getAttribute('Bottom')) + left = int(node.getAttribute('Left')) + right = int(node.getAttribute('Right')) + top = int(node.getAttribute('Top')) + text = node.getAttribute('Value') + text_vect = text.split() # this is to avoid non-utf-8 spaces + text = " ".join(text_vect) + if args.augment: + base_name, image_path = get_line_images_from_page_image(file_name, left, right, top, bottom, str(id)) + write_kaldi_process_data_files(base_name, str(id), text) + additional_pixel = random.randint(1, args.pixel_scaling) + left, right, top, bottom = expand_aabb(left, right, top, bottom, args.pixel_scaling + additional_pixel + 1) + line_id = str(id) + '_scale' + str(2) + base_name, image_path = get_line_images_from_page_image(file_name, left, right, top, bottom, line_id) + write_kaldi_process_data_files(base_name, line_id, text) + else: + base_name, image_path = get_line_images_from_page_image(file_name, left, right, top, bottom, str(id)) + write_kaldi_process_data_files(base_name, str(id), text) diff --git a/egs/rimes/v1/local/score.sh b/egs/rimes/v1/local/score.sh new file mode 100755 index 00000000000..0cfbda9b556 --- /dev/null +++ b/egs/rimes/v1/local/score.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +set -e +cmd=run.pl +stage=0 +decode_mbr=false +stats=true +beam=6 +word_ins_penalty=0.0,0.5,1.0 +min_lmwt=7 +max_lmwt=17 +iter=final + +echo "$0 $@" # Print the command line for logging +[ -f ./path.sh ] && . 
./path.sh +. parse_options.sh || exit 1; + +decode_dir=$3 +steps/scoring/score_kaldi_wer.sh --word_ins_penalty $word_ins_penalty \ + --min_lmwt $min_lmwt --max_lmwt $max_lmwt "$@" + +steps/scoring/score_kaldi_cer.sh --word_ins_penalty $word_ins_penalty \ + --min_lmwt $min_lmwt --max_lmwt $max_lmwt --stage 2 "$@" + +local/score_paragraph.sh --word_ins_penalty $word_ins_penalty \ + --min_lmwt $min_lmwt --max_lmwt $max_lmwt $decode_dir diff --git a/egs/rimes/v1/local/score_paragraph.sh b/egs/rimes/v1/local/score_paragraph.sh new file mode 100755 index 00000000000..c6ef4da1d5b --- /dev/null +++ b/egs/rimes/v1/local/score_paragraph.sh @@ -0,0 +1,38 @@ +#!/bin/bash + +min_lmwt=7 +max_lmwt=17 +word_ins_penalty=0.0,0.5,1.0 + +set -e +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +decode_dir=$1 +test_para=$decode_dir/scoring_kaldi/test_filt_para.txt + +cat $decode_dir/scoring_kaldi/test_filt.txt | \ + local/combine_line_txt_to_paragraph.py > $test_para + +for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do + for LMWT in $(seq $min_lmwt $max_lmwt); do + mkdir -p $decode_dir/para/penalty_$wip + cat $decode_dir/scoring_kaldi/penalty_$wip/$LMWT.txt | \ + local/combine_line_txt_to_paragraph.py > $decode_dir/para/penalty_$wip/$LMWT.txt + done +done + +for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do + for LMWT in $(seq $min_lmwt $max_lmwt); do + compute-wer --text --mode=present \ + ark:$test_para ark:$decode_dir/para/penalty_$wip/$LMWT.txt &> $decode_dir/para/wer_${LMWT}_${wip} || exit 1; + done +done + +for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do + for lmwt in $(seq $min_lmwt $max_lmwt); do + # adding /dev/null to the command list below forces grep to output the filename + grep WER $decode_dir/para/wer_${lmwt}_${wip} /dev/null + done +done | utils/best_wer.sh >& $decode_dir/para/best_wer || exit 1 diff --git a/egs/rimes/v1/local/train_lm.sh b/egs/rimes/v1/local/train_lm.sh new file mode 100755 index 00000000000..51927b7a97e --- /dev/null +++ b/egs/rimes/v1/local/train_lm.sh @@ -0,0 +1,105 @@ +#!/bin/bash + +# Copyright 2016 Vincent Nguyen +# 2016 Johns Hopkins University (author: Daniel Povey) +# 2017 Ashish Arora +# 2017 Hossein Hadian +# Apache 2.0 +# +# This script trains a LM on the training transcriptions. +# It is based on the example scripts distributed with PocoLM + +# It will check if pocolm is installed and if not will proceed with installation + +set -e +stage=0 +dir=data/local/local_lm +order=6 +echo "$0 $@" # Print the command line for logging +. ./utils/parse_options.sh || exit 1; + +lm_dir=${dir}/data + + +mkdir -p $dir +. ./path.sh || exit 1; # for KALDI_ROOT +export PATH=$KALDI_ROOT/tools/pocolm/scripts:$PATH +( # First make sure the pocolm toolkit is installed. + cd $KALDI_ROOT/tools || exit 1; + if [ -d pocolm ]; then + echo Not installing the pocolm toolkit since it is already there. + else + echo "$0: Please install the PocoLM toolkit with: " + echo " cd ../../../tools; extras/install_pocolm.sh; cd -" + exit 1; + fi +) || exit 1; + +bypass_metaparam_optim_opt= +# If you want to bypass the metaparameter optimization steps with specific metaparameters +# un-comment the following line, and change the numbers to some appropriate values. +# You can find the values from output log of train_lm.py. +# These example numbers of metaparameters is for 4-gram model (with min-counts) +# running with train_lm.py. +# The dev perplexity should be close to the non-bypassed model. 
+# Note: to use these example parameters, you may need to remove the .done files +# to make sure the make_lm_dir.py be called and tain only 3-gram model +#for order in 3; do +#rm -f ${lm_dir}/${num_word}_${order}.pocolm/.done +if [ $stage -le 0 ]; then + mkdir -p ${dir}/data + mkdir -p ${dir}/data/text + + echo "$0: Getting the Data sources" + + rm ${dir}/data/text/* 2>/dev/null || true + + # use the validation data as the dev set. + # Note: the name 'dev' is treated specially by pocolm, it automatically + # becomes the dev set. + head -2000 data/train/text | cut -d " " -f 2- > ${dir}/data/text/dev.txt + + # use the training data as an additional data source. + # we can later fold the dev data into this. + tail -n +2000 data/train/text | cut -d " " -f 2- > ${dir}/data/text/train.txt + + if [ -d "data/local/text_data" ]; then + cat data/local/text_data/fr_text | \ + utils/lang/bpe/prepend_words.py | utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \ + | sed 's/@@//g' > ${dir}/data/text/corpus_text.txt + fi + + # for reporting perplexities, we'll use the "real" dev set. + # (the validation data is used as ${dir}/data/text/dev.txt to work + # out interpolation weights.) + # note, we can't put it in ${dir}/data/text/, because then pocolm would use + # it as one of the data sources. + cut -d " " -f 2- < data/test/text > ${dir}/data/real_dev_set.txt + cat ${dir}/data/text/{train,corpus_text}.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count + cat ${dir}/data/word_count | awk '{print $2}' > ${dir}/data/wordlist +fi + +if [ $stage -le 1 ]; then + # decide on the vocabulary. + # Note: you'd use --wordlist if you had a previously determined word-list + # that you wanted to use. + # Note: if you have more than one order, use a certain amount of words as the + # vocab and want to restrict max memory for 'sort', + echo "$0: training the unpruned LM" + min_counts='corpus_text=2 train=1' + wordlist=${dir}/data/wordlist + + lm_name="`basename ${wordlist}`_${order}" + if [ -n "${min_counts}" ]; then + lm_name+="_`echo ${min_counts} | tr -s "[:blank:]" "_" | tr "=" "-"`" + fi + unpruned_lm_dir=${lm_dir}/${lm_name}.pocolm + train_lm.py --wordlist=${wordlist} --num-splits=20 --warm-start-ratio=20 \ + --limit-unk-history=true \ + ${bypass_metaparam_optim_opt} \ + ${dir}/data/text ${order} ${lm_dir}/work ${unpruned_lm_dir} + + get_data_prob.py ${dir}/data/real_dev_set.txt ${unpruned_lm_dir} 2>&1 | grep -F '[perplexity' + mkdir -p ${dir}/data/arpa + format_arpa_lm.py ${unpruned_lm_dir} | gzip -c > ${dir}/data/arpa/${order}gram_unpruned.arpa.gz +fi diff --git a/egs/rimes/v1/local/wer_output_filter b/egs/rimes/v1/local/wer_output_filter new file mode 100755 index 00000000000..d9cf1f4072e --- /dev/null +++ b/egs/rimes/v1/local/wer_output_filter @@ -0,0 +1,18 @@ +#!/usr/bin/env python3 + +# Copyright 2017 Hossein Hadian + +# Apache 2.0 +# This script converts a BPE-encoded text to normal text. It is used in scoring + +import sys, io +import string +infile = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8') +output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') + +for line in infile: + words = line.strip().split() + uttid = words[0] + transcript = ''.join(words[1:]) + transcript = transcript.replace('|', ' ') + output.write(uttid + ' ' + transcript + '\n') diff --git a/egs/rimes/v1/path.sh b/egs/rimes/v1/path.sh new file mode 100755 index 00000000000..c7ebe7f2abf --- /dev/null +++ b/egs/rimes/v1/path.sh @@ -0,0 +1,7 @@ +export KALDI_ROOT=`pwd`/../../.. 
+[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +export LD_LIBRARY_PATH=$KALDI_ROOT/tools/openfst/lib:$LD_LIBRARY_PATH +export LC_ALL=C diff --git a/egs/rimes/v1/run_end2end.sh b/egs/rimes/v1/run_end2end.sh new file mode 100755 index 00000000000..d3e3da2be13 --- /dev/null +++ b/egs/rimes/v1/run_end2end.sh @@ -0,0 +1,113 @@ +#!/bin/bash + +# Copyright 2018 Hossein Hadian +# Ashish Arora +# Jonathan Chang +# Apache 2.0 + +set -e +stage=0 +nj=50 +overwrite=false +rimes_database=/export/corpora5/handwriting_ocr/RIMES +train_set=train +use_extra_corpus_text=true +. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. +. ./path.sh +. ./utils/parse_options.sh # e.g. this parses the above options + # if supplied. + +if [ $stage -le 0 ]; then + if [ -f data/train/text ] && ! $overwrite; then + echo "$0: Not processing, probably script have run from wrong stage" + echo "Exiting with status 1 to avoid data corruption" + exit 1; + fi + + echo "$0: Preparing data..." + local/prepare_data.sh --download-dir "$rimes_database" \ + --use_extra_corpus_text $use_extra_corpus_text + +fi + +mkdir -p data/{train,test,val}/data +if [ $stage -le 1 ]; then + echo "$(date) stage 1: getting allowed image widths for e2e training..." + image/get_image2num_frames.py --feat-dim 40 data/train + image/get_allowed_lengths.py --frame-subsampling-factor 4 10 data/train + echo "$(date) Extracting features, creating feats.scp file" + for set in train test val; do + local/extract_features.sh --nj $nj --cmd "$cmd" data/${set} + steps/compute_cmvn_stats.sh data/${set} || exit 1; + done + utils/fix_data_dir.sh data/train +fi + +if [ $stage -le 3 ]; then + echo "$0: Preparing BPE..." + # getting non-silence phones. + cut -d' ' -f2- data/train/text | \ +python3 <( +cat << "END" +import os, sys, io; +infile = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8'); +output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8'); +phone_dict = dict(); +for line in infile: + line_vect = line.strip().split(); + for word in line_vect: + for phone in word: + phone_dict[phone] = phone; +for phone in phone_dict.keys(): + output.write(phone+ '\n'); +END + ) > data/local/phones.txt + + cut -d' ' -f2- data/train/text > data/local/train_data.txt + cat data/local/phones.txt data/local/train_data.txt | \ + utils/lang/bpe/prepend_words.py | \ + utils/lang/bpe/learn_bpe.py -s 700 > data/local/bpe.txt + + for set in test train val; do + cut -d' ' -f1 data/$set/text > data/$set/ids + cut -d' ' -f2- data/$set/text | \ + utils/lang/bpe/prepend_words.py | utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \ + | sed 's/@@//g' > data/$set/bpe_text + mv data/$set/text data/$set/text.old + paste -d' ' data/$set/ids data/$set/bpe_text > data/$set/text + rm -f data/$set/bpe_text data/$set/ids + done +fi + +if [ $stage -le 4 ]; then + echo "$0: Preparing dictionary and lang..." + local/prepare_dict.sh + utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.0 --position-dependent-phones false \ + data/local/dict "" data/lang/temp data/lang + utils/lang/bpe/add_final_optional_silence.sh --final-sil-prob 0.5 data/lang +fi + +if [ $stage -le 5 ]; then + echo "$0: Estimating a language model for decoding..." 
+  local/train_lm.sh
+  utils/format_lm.sh data/lang data/local/local_lm/data/arpa/6gram_unpruned.arpa.gz \
+                     data/local/dict/lexicon.txt data/lang
+fi
+
+if [ $stage -le 6 ]; then
+  echo "$0: Calling the flat-start chain recipe..."
+  local/chain/run_e2e_cnn.sh --train_set $train_set
+fi
+
+if [ $stage -le 7 ]; then
+  echo "$0: Aligning the training data using the e2e chain model..."
+  steps/nnet3/align.sh --nj 50 --cmd "$cmd" \
+    --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0 --acoustic-scale=1.0' \
+    data/$train_set data/lang exp/chain/e2e_cnn_1a exp/chain/e2e_ali_train
+fi
+
+if [ $stage -le 8 ]; then
+  echo "$0: Building a tree and training a regular chain model using the e2e alignments..."
+  local/chain/run_cnn_e2eali.sh --train_set $train_set
+fi
diff --git a/egs/rimes/v1/steps b/egs/rimes/v1/steps
new file mode 120000
index 00000000000..1b186770dd1
--- /dev/null
+++ b/egs/rimes/v1/steps
@@ -0,0 +1 @@
+../../wsj/s5/steps/
\ No newline at end of file
diff --git a/egs/rimes/v1/utils b/egs/rimes/v1/utils
new file mode 120000
index 00000000000..a3279dc8679
--- /dev/null
+++ b/egs/rimes/v1/utils
@@ -0,0 +1 @@
+../../wsj/s5/utils/
\ No newline at end of file
diff --git a/egs/sitw/v1/local/make_musan.py b/egs/sitw/v1/local/make_musan.py
index 74c434990fb..c4b5c9359b4 100755
--- a/egs/sitw/v1/local/make_musan.py
+++ b/egs/sitw/v1/local/make_musan.py
@@ -47,9 +47,9 @@ def prepare_music(root_dir, use_vocals):
             utt2wav_str = utt2wav_str + utt + " " + utt2wav[utt] + "\n"
             num_good_files += 1
         else:
-            print("Missing file", utt)
+            print("Missing file {}".format(utt))
             num_bad_files += 1
-    print("In music directory, processed", num_good_files, "files;", num_bad_files, "had missing wav data")
+    print("In music directory, processed {} files; {} had missing wav data".format(num_good_files, num_bad_files))
     return utt2spk_str, utt2wav_str

 def prepare_speech(root_dir):
@@ -73,9 +73,9 @@ def prepare_speech(root_dir):
             utt2wav_str = utt2wav_str + utt + " " + utt2wav[utt] + "\n"
             num_good_files += 1
         else:
-            print("Missing file", utt)
+            print("Missing file {}".format(utt))
             num_bad_files += 1
-    print("In speech directory, processed", num_good_files, "files;", num_bad_files, "had missing wav data")
+    print("In speech directory, processed {} files; {} had missing wav data".format(num_good_files, num_bad_files))
     return utt2spk_str, utt2wav_str

 def prepare_noise(root_dir):
@@ -99,9 +99,9 @@ def prepare_noise(root_dir):
             utt2wav_str = utt2wav_str + utt + " " + utt2wav[utt] + "\n"
             num_good_files += 1
         else:
-            print("Missing file", utt)
+            print("Missing file {}".format(utt))
             num_bad_files += 1
-    print("In noise directory, processed", num_good_files, "files;", num_bad_files, "had missing wav data")
+    print("In noise directory, processed {} files; {} had missing wav data".format(num_good_files, num_bad_files))
     return utt2spk_str, utt2wav_str

 def main():
diff --git a/egs/sprakbanken/s5/local/normalize_transcript.py b/egs/sprakbanken/s5/local/normalize_transcript.py
index 2374418bee7..21d70864f04 100755
--- a/egs/sprakbanken/s5/local/normalize_transcript.py
+++ b/egs/sprakbanken/s5/local/normalize_transcript.py
@@ -17,8 +17,8 @@
     "\t": " "
     }

-from_chars = ''.join(normdict.keys())
-to_chars = ''.join(normdict.values())
+from_chars = ''.join(list(normdict.keys()))
+to_chars = ''.join(list(normdict.values()))

 #t_table = maketrans(from_chars, to_chars)

diff --git a/egs/sprakbanken/s5/local/sprak2kaldi.py b/egs/sprakbanken/s5/local/sprak2kaldi.py
index f3abf1d9a38..5fa4baa1fa2 100755
---
a/egs/sprakbanken/s5/local/sprak2kaldi.py +++ b/egs/sprakbanken/s5/local/sprak2kaldi.py @@ -16,6 +16,7 @@ # limitations under the License. ''' +from __future__ import print_function import sys @@ -59,8 +60,8 @@ def create_parallel_file_list(session, sndlist, txtlist): if len(os.listdir(session.sessiondir)) != 0: # Check if there are files in the directory global n n += 1 - session.sessiondir = session.sessiondir + "_" + str(n) - session.speaker_id = session.speaker_id + "_" + str(n) + session.sessiondir = "{}_{}".format(session.sessiondir, n) + session.speaker_id = "{}_{}".format(session.speaker_id, n) os.mkdir(session.sessiondir) shadow = True else: diff --git a/egs/sprakbanken/s5/local/sprak2parallel.py b/egs/sprakbanken/s5/local/sprak2parallel.py index b5fe56fd60f..3dc82e30ac2 100755 --- a/egs/sprakbanken/s5/local/sprak2parallel.py +++ b/egs/sprakbanken/s5/local/sprak2parallel.py @@ -76,8 +76,8 @@ def make_speech_corpus(top, dest, srcfolder): session.sessiondir = os.path.join(dest, session.filestem) +"."+ session.speaker_id if os.path.exists(session.sessiondir): n += 1 - session.sessiondir = session.sessiondir+ "_" +str(n) - session.speaker_id+ "_" +str(n) + session.sessiondir = "{}_{}".format(session.sessiondir, n) + session.speaker_id = "{}_{}".format(session.speaker_id, n) os.mkdir(session.sessiondir) create_parallel_files(session) diff --git a/egs/sprakbanken/s5/local/sprakparser.py b/egs/sprakbanken/s5/local/sprakparser.py index 7bdf6ac94e3..1221cf0b023 100755 --- a/egs/sprakbanken/s5/local/sprakparser.py +++ b/egs/sprakbanken/s5/local/sprakparser.py @@ -22,11 +22,12 @@ ''' +from __future__ import print_function import codecs import os -class Session: +class Session(object): delimit = ">-<" @@ -151,7 +152,7 @@ def set_channel_vars(self, handle): pass def create_filename(self, uid, file_ending): - return self.filestem+ "." +self.speaker_id+ "." +str(uid)+ "." +file_ending + return "{}.{}.{}.{}".format(self.filestem, self.speaker_id, uid, file_ending) def wavpath(self, topfolder): prefix, suffix = topfolder.rsplit('/data/', 1) diff --git a/egs/sprakbanken/s5/local/writenumbers.py b/egs/sprakbanken/s5/local/writenumbers.py index df3235243d4..c419b3c7550 100755 --- a/egs/sprakbanken/s5/local/writenumbers.py +++ b/egs/sprakbanken/s5/local/writenumbers.py @@ -22,6 +22,7 @@ Changed to write output to file to prevent problems with shell ascii codec. ''' +from __future__ import print_function import sys import os @@ -215,7 +216,7 @@ def rmPvAnnotation(string): def normNumber(line, table): tokens = line.split() - keys = table.keys() + keys = list(table.keys()) for num, tok in enumerate(tokens): newtoks = splitNumeric(tok) if newtoks != False: diff --git a/egs/sprakbanken_swe/s5/local/normalize_transcript.py b/egs/sprakbanken_swe/s5/local/normalize_transcript.py index 90e45744e2a..150a9563aba 100755 --- a/egs/sprakbanken_swe/s5/local/normalize_transcript.py +++ b/egs/sprakbanken_swe/s5/local/normalize_transcript.py @@ -18,8 +18,8 @@ } #removes all the above signs -from_chars = ''.join(normdict.keys()) -to_chars = ''.join(normdict.values()) +from_chars = ''.join(list(normdict.keys())) +to_chars = ''.join(list(normdict.values())) t_table = str.maketrans(normdict) diff --git a/egs/sprakbanken_swe/s5/local/sprak2kaldi.py b/egs/sprakbanken_swe/s5/local/sprak2kaldi.py index cc67344c36e..8f723762e50 100755 --- a/egs/sprakbanken_swe/s5/local/sprak2kaldi.py +++ b/egs/sprakbanken_swe/s5/local/sprak2kaldi.py @@ -16,6 +16,7 @@ # limitations under the License. 
''' +from __future__ import print_function import sys @@ -59,8 +60,8 @@ def create_parallel_file_list(session, sndlist, txtlist): if len(os.listdir(session.sessiondir)) != 0: # Check if there are files in the directory global n n += 1 - session.sessiondir = session.sessiondir + "_" + str(n) - session.speaker_id = session.speaker_id + "_" + str(n) + session.sessiondir = "{}_{}".format(session.sessiondir, n) + session.speaker_id = "{}_{}".format(session.speaker_id, n) os.mkdir(session.sessiondir) shadow = True else: diff --git a/egs/sprakbanken_swe/s5/local/sprakparser.py b/egs/sprakbanken_swe/s5/local/sprakparser.py index 4775328b56b..0951f7f39e7 100755 --- a/egs/sprakbanken_swe/s5/local/sprakparser.py +++ b/egs/sprakbanken_swe/s5/local/sprakparser.py @@ -26,7 +26,7 @@ import codecs import os -class Session: +class Session(object): delimit = ">-<" @@ -151,7 +151,7 @@ def set_channel_vars(self, handle): pass def create_filename(self, uid, file_ending): - return self.filestem+ "." +self.speaker_id+ "." +str(uid)+ "." +file_ending + return "{}.{}.{}.{}".format(self.filestem, self.speaker_id, uid, file_ending) def wavpath(self, topfolder): prefix, suffix = topfolder.rsplit('/data/', 1) diff --git a/egs/sre08/v1/sid/nnet3/xvector/allocate_egs.py b/egs/sre08/v1/sid/nnet3/xvector/allocate_egs.py index 72a4572d9a0..e1a4fc534e0 100755 --- a/egs/sre08/v1/sid/nnet3/xvector/allocate_egs.py +++ b/egs/sre08/v1/sid/nnet3/xvector/allocate_egs.py @@ -65,6 +65,7 @@ # We're using python 3.x style print but want it to work in python 2.x. from __future__ import print_function +from __future__ import division import re, os, argparse, sys, math, warnings, random def get_args(): @@ -196,7 +197,7 @@ def deterministic_chunk_length(archive_id, num_archives, min_frames_per_chunk, m elif num_archives == 1: return int(max_frames_per_chunk); else: - return int(math.pow(float(max_frames_per_chunk) / + return int(math.pow(float(max_frames_per_chunk)/ min_frames_per_chunk, float(archive_id) / (num_archives-1)) * min_frames_per_chunk + 0.5) @@ -247,7 +248,7 @@ def main(): length = deterministic_chunk_length(archive_index, args.num_archives, args.min_frames_per_chunk, args.max_frames_per_chunk); print("{0} {1}".format(archive_index + 1, length), file=info_f) archive_chunk_lengths.append(length) - this_num_egs = int((args.frames_per_iter / length) + 1) + this_num_egs = int(float(args.frames_per_iter) / length + 1) this_egs = [ ] # A 2-tuple of the form (utt-id, start-frame) spkrs = args.num_repeats * list(spk2utt.keys()) random.shuffle(spkrs) diff --git a/egs/sre10/v1/local/prepare_for_eer.py b/egs/sre10/v1/local/prepare_for_eer.py index 59d2985e7c2..bb4e666f0ab 100755 --- a/egs/sre10/v1/local/prepare_for_eer.py +++ b/egs/sre10/v1/local/prepare_for_eer.py @@ -1,3 +1,4 @@ +from __future__ import print_function # Copyright 2015 David Snyder # Apache 2.0. 
# @@ -12,4 +13,4 @@ spkrutt2target[spkr+utt]=target for line in scores: spkr, utt, score = line.strip().split() - print score, spkrutt2target[spkr+utt] + print("{} {}".format(score, spkrutt2target[spkr+utt])) diff --git a/egs/sre16/v1/local/make_musan.py b/egs/sre16/v1/local/make_musan.py index b3f6652ba40..7735bd28818 100755 --- a/egs/sre16/v1/local/make_musan.py +++ b/egs/sre16/v1/local/make_musan.py @@ -43,9 +43,9 @@ def prepare_music(root_dir, use_vocals): utt2wav_str = utt2wav_str + utt + " sox -t wav " + utt2wav[utt] + " -r 8k -t wav - |\n" num_good_files += 1 else: - print("Missing file", utt) + print("Missing file {}".format(utt)) num_bad_files += 1 - print("In music directory, processed", num_good_files, "files;", num_bad_files, "had missing wav data") + print("In music directory, processed {} files; {} had missing wav data".format(num_good_files, num_bad_files)) return utt2spk_str, utt2wav_str def prepare_speech(root_dir): @@ -69,9 +69,9 @@ def prepare_speech(root_dir): utt2wav_str = utt2wav_str + utt + " sox -t wav " + utt2wav[utt] + " -r 8k -t wav - |\n" num_good_files += 1 else: - print("Missing file", utt) + print("Missing file {}".format(utt)) num_bad_files += 1 - print("In speech directory, processed", num_good_files, "files;", num_bad_files, "had missing wav data") + print("In speech directory, processed {} files; {} had missing wav data".format(num_good_files, num_bad_files)) return utt2spk_str, utt2wav_str def prepare_noise(root_dir): @@ -95,9 +95,9 @@ def prepare_noise(root_dir): utt2wav_str = utt2wav_str + utt + " sox -t wav " + utt2wav[utt] + " -r 8k -t wav - |\n" num_good_files += 1 else: - print("Missing file", utt) + print("Missing file {}".format(utt)) num_bad_files += 1 - print("In noise directory, processed", num_good_files, "files;", num_bad_files, "had missing wav data") + print("In noise directory, processed {} files; {} had missing wav data".format(num_good_files, num_bad_files)) return utt2spk_str, utt2wav_str def main(): diff --git a/egs/svhn/v1/local/process_data.py b/egs/svhn/v1/local/process_data.py index f6ea85118f9..2a5bfc9a0d6 100755 --- a/egs/svhn/v1/local/process_data.py +++ b/egs/svhn/v1/local/process_data.py @@ -6,6 +6,7 @@ """ This script prepares the training and test data for SVHN. 
""" +from __future__ import division import argparse import os @@ -16,11 +17,11 @@ parser = argparse.ArgumentParser(description="""Converts train/test data of SVHN (Street View House Numbers) dataset to Kaldi feature format""") -parser.add_argument('matlab_file', type=str, +parser.add_argument('matlab_file', help='path to SVHN matlab data file (cropped version)') -parser.add_argument('dir', type=str, +parser.add_argument('dir', help='output dir') -parser.add_argument('--out-ark', type=str, +parser.add_argument('--out-ark', default='-', help='where to write output feature data') args = parser.parse_args() @@ -48,7 +49,7 @@ def write_kaldi_matrix(file_handle, matrix, key): if num_cols != len(matrix[row_index]): raise Exception("All the rows of a matrix are expected to " "have the same length") - file_handle.write(" ".join(map(lambda x: str(x), matrix[row_index]))) + file_handle.write(" ".join([str(x) for x in matrix[row_index]])) if row_index != num_rows - 1: file_handle.write("\n") file_handle.write(" ]\n") @@ -80,7 +81,7 @@ def zeropad(x, length): lbl = labels[i, 0] if lbl == 10: lbl = 0 - labels_fh.write(key + ' ' + str(lbl) + '\n') + labels_fh.write("{} {}\n".format(key, lbl)) img = data[i] write_kaldi_matrix(out_fh, img, key) img_id += 1 diff --git a/egs/swbd/s5c/local/map_acronyms_ctm.py b/egs/swbd/s5c/local/map_acronyms_ctm.py index bee488f73b0..7ae59d2a1d0 100755 --- a/egs/swbd/s5c/local/map_acronyms_ctm.py +++ b/egs/swbd/s5c/local/map_acronyms_ctm.py @@ -10,6 +10,7 @@ # en_4156 B 414.58 0.16 l # en_4156 B 414.74 0.17 a +from __future__ import division import argparse,re __author__ = 'Minhua Wu' diff --git a/egs/tedlium/s5/local/join_suffix.py b/egs/tedlium/s5/local/join_suffix.py index 64c62964331..c36b96a07f9 100755 --- a/egs/tedlium/s5/local/join_suffix.py +++ b/egs/tedlium/s5/local/join_suffix.py @@ -5,6 +5,7 @@ # Apache 2.0 +from __future__ import print_function import sys from codecs import open diff --git a/egs/tedlium/s5_r2/local/join_suffix.py b/egs/tedlium/s5_r2/local/join_suffix.py index 64c62964331..c36b96a07f9 100755 --- a/egs/tedlium/s5_r2/local/join_suffix.py +++ b/egs/tedlium/s5_r2/local/join_suffix.py @@ -5,6 +5,7 @@ # Apache 2.0 +from __future__ import print_function import sys from codecs import open diff --git a/egs/tedlium/s5_r2/local/run_learn_lex.sh b/egs/tedlium/s5_r2/local/run_learn_lex_bayesian.sh similarity index 98% rename from egs/tedlium/s5_r2/local/run_learn_lex.sh rename to egs/tedlium/s5_r2/local/run_learn_lex_bayesian.sh index a2a6f2e46b8..f1497bfe202 100755 --- a/egs/tedlium/s5_r2/local/run_learn_lex.sh +++ b/egs/tedlium/s5_r2/local/run_learn_lex_bayesian.sh @@ -2,7 +2,7 @@ # # This script demonstrates a lexicon learning recipe, which aims to imrove # the pronounciation of abbreviated words in the TED-LIUM lexicon. It assumes -# the model exp/tri3 already exists. Please see steps/dict/learn_lexicon.sh +# the model exp/tri3 already exists. Please see steps/dict/learn_lexicon_bayesian.sh # for explanation of the options. # # Copyright 2016 Xiaohui Zhang @@ -78,7 +78,7 @@ fi # Learn a lexicon based on the acoustic training data and the reference lexicon. 
 if [ $stage -le 1 ]; then
-  steps/dict/learn_lexicon.sh --lexicon-g2p "$data/lexicon_oov_g2p.txt" \
+  steps/dict/learn_lexicon_bayesian.sh --lexicon-g2p "$data/lexicon_oov_g2p.txt" \
     --min-prob $min_prob --variants-prob-mass $variants_prob_mass \
     --variants-prob-mass-ref $variants_prob_mass_ref \
     --prior-counts-tot $prior_counts_tot --prior-mean $prior_mean \
diff --git a/egs/tedlium/s5_r2/local/run_learn_lex_greedy.sh b/egs/tedlium/s5_r2/local/run_learn_lex_greedy.sh
new file mode 100755
index 00000000000..f69af3fe360
--- /dev/null
+++ b/egs/tedlium/s5_r2/local/run_learn_lex_greedy.sh
@@ -0,0 +1,133 @@
+#! /bin/bash
+#
+# This script demonstrates a lexicon learning recipe, which aims to improve
+# the pronunciation of abbreviated words in the TED-LIUM lexicon. It assumes
+# the model exp/tri3 already exists. Please see steps/dict/learn_lexicon_greedy.sh
+# for explanation of the options.
+#
+# Copyright 2018 Xiaohui Zhang
+# Apache 2.0
+
+. ./cmd.sh
+. ./path.sh
+
+oov_symbol=""
+# The user may have a phonetisaurus-trained English g2p model ready.
+g2p_mdl_dir=
+# The dir which contains the reference lexicon (most probably hand-derived)
+# we want to expand/improve, and nonsilence_phones.txt, etc., which we need
+# for building new dict dirs.
+ref_dict=data/local/dict
+# acoustic training data we use to get alternative
+# pronunciations and collect acoustic evidence.
+data=data/train
+# the cut-off parameter used to select pronunciation candidates from phone
+# decoding. We remove pronunciations with probabilities less than this value
+# after normalizing the probs s.t. the max-prob is 1.0 for each word.
+min_prob=0.1
+# Refer to steps/dict/select_prons_greedy.sh for the detailed meaning of
+# alpha, beta and delta. Basically, the three dimensions of alpha
+# and beta correspond to three pronunciation sources: phonetic-
+# decoding, G2P and the reference lexicon, and the larger a value is,
+# the more aggressively we'll prune pronunciations from that source.
+# The valid range of each dim. is [0, 1] for alpha (0 means we never
+# prune prons from that source) and [0, 100] for beta.
+alpha="0.04,0.02,0"
+beta="30,5,0"
+# Floor value of the pronunciation posterior statistics.
+delta=0.00000001
+# This parameter determines how many pronunciations we keep for each word
+# after the first pass pruning. See steps/dict/internal/prune_pron_candidates.py
+# for details.
+vcr=16
+
+# Intermediate outputs of the lexicon learning stage will be put into dir
+dir=exp/tri3_lex_greedy_work
+nj=35
+decode_nj=30
+stage=0
+lexlearn_stage=0
+affix="learned_greedy"
+
+. utils/parse_options.sh # accept options
+
+# The reference vocab is the list of words for which we already have hand-derived pronunciations.
+ref_vocab=data/local/vocab.txt
+cat $ref_dict/lexicon.txt | awk '{print $1}' | sort | uniq > $ref_vocab || exit 1;
+
+# Get a G2P generated lexicon for oov words (w.r.t the reference lexicon)
+# in acoustic training data.
+if [ $stage -le 0 ]; then + if [ -z $g2p_mdl_dir ]; then + g2p_mdl_dir=exp/g2p_phonetisaurus + steps/dict/train_g2p_phonetisaurus.sh $ref_dict/lexicon.txt $g2p_mdl_dir || exit 1; + fi + awk '{for (n=2;n<=NF;n++) vocab[$n]=1;} END{for (w in vocab) printf "%s\n",w;}' \ + $data/text | sort -u > $data/train_vocab.txt || exit 1; + awk 'NR==FNR{a[$1] = 1; next} {if(!($1 in a)) print $1}' $ref_vocab \ + $data/train_vocab.txt | sort > $data/oov_train.txt || exit 1; + steps/dict/apply_g2p_phonetisaurus.sh --nbest 5 $data/train_vocab.txt $g2p_mdl_dir \ + exp/g2p_phonetisaurus/lex_train || exit 1; +fi + +# Learn a lexicon based on the acoustic training data and the reference lexicon. +if [ $stage -le 1 ]; then + steps/dict/learn_lexicon_greedy.sh --lexiconp-g2p "exp/g2p_phonetisaurus/lex_train/lexicon.lex" \ + --alpha $alpha --beta $beta --delta $delta \ + --min-prob $min_prob --cmd "$train_cmd" \ + --variant-counts-ratio $vcr \ + --stage $lexlearn_stage --nj 60 --oov-symbol $oov_symbol --retrain-src-mdl false \ + $ref_dict $ref_vocab $data exp/tri3 data/lang data/local/dict_${affix}_nosp \ + $dir || exit 1; +fi + +# Add pronounciation probs to the learned lexicon. +if [ $stage -le 2 ]; then + utils/prepare_lang.sh --phone-symbol-table data/lang/phones.txt \ + data/local/dict_${affix}_nosp $oov_symbol data/local/lang_${affix}_nosp data/lang_${affix}_nosp || exit 1; + + steps/align_si.sh --nj $nj --cmd "$train_cmd" \ + $data data/lang_${affix}_nosp exp/tri2 exp/tri2_ali_${affix}_nosp || exit 1; + + steps/get_prons.sh --cmd "$train_cmd" data/train data/lang_${affix}_nosp exp/tri2_ali_${affix}_nosp || exit 1; + + utils/dict_dir_add_pronprobs.sh --max-normalize true \ + data/local/dict_${affix}_nosp exp/tri2_ali_${affix}_nosp/pron_counts_nowb.txt \ + exp/tri2_ali_${affix}_nosp/sil_counts_nowb.txt \ + exp/tri2_ali_${affix}_nosp/pron_bigram_counts_nowb.txt data/local/dict_${affix} || exit 1; + + utils/prepare_lang.sh --phone-symbol-table data/lang/phones.txt \ + data/local/dict_${affix} $oov_symbol data/local/lang_${affix} data/lang_${affix} || exit 1; +fi + +# Re-decode +if [ $stage -le 3 ]; then + ! cmp data/lang_nosp/words.txt data/lang_${affix}/words.txt &&\ + echo "$0: The vocab of the affix lexicon and the reference vocab may be incompatible." + cp data/lang_nosp/G.fst data/lang_${affix}/ + utils/mkgraph.sh data/lang_${affix} exp/tri3 exp/tri3/graph_${affix} || exit 1; + + for dset in dev test; do + ( steps/decode_fmllr.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \ + exp/tri3/graph_${affix} data/${dset} exp/tri3/decode_${affix}_${dset} || exit 1; + ) & + done +fi + +# RESULTS: +# Baseline: +# %WER 18.7 | 507 17783 | 83.9 11.4 4.7 2.6 18.7 92.3 | -0.006 | exp/tri3/decode_dev/score_17_0.0/ctm.filt.filt.sys +# %WER 17.6 | 1155 27500 | 84.7 11.6 3.7 2.4 17.6 87.2 | 0.013 | exp/tri3/decode_test/score_15_0.0/ctm.filt.filt.sys + +# Re-decoding with the learned lexicon: +# %WER 18.5 | 507 17783 | 84.3 11.2 4.5 2.8 18.5 92.3 | -0.007 | exp/tri3/decode_learned_greedy_dev/score_16_0.0/ctm.filt.filt.sys +# %WER 17.5 | 1155 27500 | 84.9 11.5 3.6 2.4 17.5 87.5 | 0.035 | exp/tri3/decode_learned_greedy_test/score_14_0.0/ctm.filt.filt.sys + +# To see the effect to neural-net results, one should re-train NN with the learned lexicon. 
+# Experiments have shown that, with the new lang dir, one should just re-run NN training +# starting from the supervision generation (steps/align_fmllr_lats.sh) stage, and should +# expect improved overall WERs and word recognition performance on words whose pronunciations +# were changed. + +exit +wait diff --git a/egs/tedlium/s5_r2_wsj/local/lm/merge_word_counts.py b/egs/tedlium/s5_r2_wsj/local/lm/merge_word_counts.py index 6338cbbf875..85e15d8dc07 100755 --- a/egs/tedlium/s5_r2_wsj/local/lm/merge_word_counts.py +++ b/egs/tedlium/s5_r2_wsj/local/lm/merge_word_counts.py @@ -7,6 +7,7 @@ A min-count argument is required to only write counts that are above the specified minimum count. """ +from __future__ import print_function import sys @@ -21,7 +22,7 @@ def main(): parts = line.strip().split() words[parts[1]] = words.get(parts[1], 0) + int(parts[0]) - for word, count in words.iteritems(): + for word, count in words.items(): if count >= int(sys.argv[1]): print ("{0} {1}".format(count, word)) diff --git a/egs/tedlium/s5_r3/local/join_suffix.py b/egs/tedlium/s5_r3/local/join_suffix.py index 64c62964331..c36b96a07f9 100755 --- a/egs/tedlium/s5_r3/local/join_suffix.py +++ b/egs/tedlium/s5_r3/local/join_suffix.py @@ -5,6 +5,7 @@ # Apache 2.0 +from __future__ import print_function import sys from codecs import open diff --git a/egs/thchs30/s5/local/dae/add-noise-mod.py b/egs/thchs30/s5/local/dae/add-noise-mod.py index 8327fc325ee..4486fd0fdc7 100755 --- a/egs/thchs30/s5/local/dae/add-noise-mod.py +++ b/egs/thchs30/s5/local/dae/add-noise-mod.py @@ -3,6 +3,7 @@ from __future__ import print_function +from __future__ import division import optparse import random import bisect @@ -26,7 +27,7 @@ def energy(mat): def mix(mat, noise, pos, scale): ret = [] l = len(noise) - for i in xrange(len(mat)): + for i in range(len(mat)): x = mat[i] d = int(x + scale * noise[pos]) #if d > 32767 or d < -32768: @@ -41,8 +42,8 @@ def mix(mat, noise, pos, scale): def dirichlet(params): samples = [random.gammavariate(x, 1) if x > 0 else 0. for x in params] - samples = [x / sum(samples) for x in samples] - for x in xrange(1, len(samples)): + samples = [(x / sum(samples)) for x in samples] + for x in range(1, len(samples)): samples[x] += samples[x - 1] return bisect.bisect_left(samples, random.random()) @@ -125,7 +126,7 @@ def main(): mat = wave_mat(wav) signal = energy(mat) logging.debug('signal energy: %f', signal) - noise = signal / (10 ** (noise_level / 10.)) + noise = signal / (10 ** (noise_level / 10)) logging.debug('noise energy: %f', noise) type = dirichlet(params) logging.debug('selected type: %d', type) diff --git a/egs/tunisian_msa/s5/local/buckwalter2unicode.py b/egs/tunisian_msa/s5/local/buckwalter2unicode.py index 94fec3225dd..f81841261ce 100755 --- a/egs/tunisian_msa/s5/local/buckwalter2unicode.py +++ b/egs/tunisian_msa/s5/local/buckwalter2unicode.py @@ -27,6 +27,7 @@ # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # +from __future__ import print_function import sys, getopt, codecs, os, re # Declare a dictionary with Buckwalter's ASCII symbols as the keys, and @@ -87,7 +88,7 @@ uni2buck = {} # Iterate through all the items in the buck2uni dict. -for (key, value) in buck2uni.iteritems(): +for (key, value) in buck2uni.items(): # The value from buck2uni becomes a key in uni2buck, and vice # versa for the keys. uni2buck[value] = key @@ -108,103 +109,103 @@ # A function to print to screen the usage details of this script. 
def usage(): - print "Usage:", sys.argv[0], "-i INFILE -o OUTFILE [-g CHARS -c RANGE -d CHAR" - print " -r -e INPUT_ENCODING, -E OUTPUT ENCODING]" - print " ", sys.argv[0], "-l" - print " ", sys.argv[0], "-h" - print "" - print " -i INFILE, --input=INFILE:" - print " Path to text file to be transliterated to Unicode." - print " -o OUTFILE, --output=OUTFILE:" - print " Path of file to output the newly transliterated text." - print " -e ENC, --input-encoding=ENC:" - print " Specify the text encoding of the source file. Default: latin_1." - print " -E ENC, --output-encoding=ENC:" - print " Specify the text encoding of the target file. Default: utf_8." - print " -g CHARS, --ignore-lines=CHARS:" - print " Will not transliterate lines that start with any of the CHARS" - print " given. E.g., -g #; will not alter lines starting with # or ;." - print " (May need to be -g \#\; on some platforms. See README.txt.)" - print " -c RANGE, --columns=RANGE:" - print " If in columns, select columns to apply transliteration. Can be" - print " comma separated numbers, or a range. E.g., -c 1, -c 1-3, -c 1,3." - print " -d CHAR, --delimiter=CHAR:" - print " Specify the delimiter that defines the column if using the -c" - print " option above. Default is ' ' (space)." - print " -r, --reverse:" - print " Reverses the transliteration, i.e., Arabic to Buckwalter." - print " When used, it will change the default input encoding to utf_8 and" - print " output encoding to latin_1" - print " -l, --list-encodings:" - print " Displays all supported file encodings." - print " -h, --help:" - print " Displays this page." - print "" + print("Usage: {} -i INFILE -o OUTFILE [-g CHARS -c RANGE -d CHAR".format(sys.argv[0])) + print(" -r -e INPUT_ENCODING, -E OUTPUT ENCODING]") + print(" {} -l".format(sys.argv[0])) + print(" {} -h".format(sys.argv[0])) + print("") + print(" -i INFILE, --input=INFILE:") + print(" Path to text file to be transliterated to Unicode.") + print(" -o OUTFILE, --output=OUTFILE:") + print(" Path of file to output the newly transliterated text.") + print(" -e ENC, --input-encoding=ENC:") + print(" Specify the text encoding of the source file. Default: latin_1.") + print(" -E ENC, --output-encoding=ENC:") + print(" Specify the text encoding of the target file. Default: utf_8.") + print(" -g CHARS, --ignore-lines=CHARS:") + print(" Will not transliterate lines that start with any of the CHARS") + print(" given. E.g., -g #; will not alter lines starting with # or ;.") + print(" (May need to be -g \#\; on some platforms. See README.txt.)") + print(" -c RANGE, --columns=RANGE:") + print(" If in columns, select columns to apply transliteration. Can be") + print(" comma separated numbers, or a range. E.g., -c 1, -c 1-3, -c 1,3.") + print(" -d CHAR, --delimiter=CHAR:") + print(" Specify the delimiter that defines the column if using the -c") + print(" option above. Default is ' ' (space).") + print(" -r, --reverse:") + print(" Reverses the transliteration, i.e., Arabic to Buckwalter.") + print(" When used, it will change the default input encoding to utf_8 and") + print(" output encoding to latin_1") + print(" -l, --list-encodings:") + print(" Displays all supported file encodings.") + print(" -h, --help:") + print(" Displays this page.") + print("") # A function to print to screen all the available encodings supported by # Python. 
def displayEncodings(): - print "Codec Aliases Languages" - print "ascii 646, us-ascii English" - print "cp037 IBM037, IBM039 English" - print "cp424 EBCDIC-CP-HE, IBM424 Hebrew" - print "cp437 437, IBM437 English" - print "cp500 EBCDIC-CP-BE, EBCDIC-CP-CH, IBM500 Western Europe" - print "cp737 Greek" - print "cp775 IBM775 Baltic languages" - print "cp850 850, IBM850 Western Europe" - print "cp852 852, IBM852 Central and Eastern Europe" - print "cp855 855, IBM855 Bulgarian, Byelorussian, Macedonian, Russian, Serbian" - print "cp856 Hebrew" - print "cp857 857, IBM857 Turkish" - print "cp860 860, IBM860 Portuguese" - print "cp861 861, CP-IS, IBM861 Icelandic" - print "cp862 862, IBM862 Hebrew" - print "cp863 863, IBM863 Canadian" - print "cp864 IBM864 Arabic" - print "cp865 865, IBM865 Danish, Norwegian" - print "cp869 869, CP-GR, IBM869 Greek" - print "cp874 Thai" - print "cp875 Greek" - print "cp1006 Urdu" - print "cp1026 ibm1026 Turkish" - print "cp1140 ibm1140 Western Europe" - print "cp1250 windows-1250 Central and Eastern Europe" - print "cp1251 windows-1251 Bulgarian, Byelorussian, Macedonian, Russian, Serbian" - print "cp1252 windows-1252 Western Europe" - print "cp1253 windows-1253 Greek" - print "cp1254 windows-1254 Turkish" - print "cp1255 windows-1255 Hebrew" - print "cp1256 windows-1256 Arabic" - print "cp1257 windows-1257 Baltic languages" - print "cp1258 windows-1258 Vietnamese" - print "latin_1 iso-8859-1, iso8859-1, 8859, cp819, latin, latin1, L1 West Europe" - print "iso8859_2 iso-8859-2, latin2, L2 Central and Eastern Europe" - print "iso8859_3 iso-8859-3, latin3, L3 Esperanto, Maltese" - print "iso8859_4 iso-8859-4, latin4, L4 Baltic languagues" - print "iso8859_5 iso-8859-5, cyrillic Bulgarian, Byelorussian, Macedonian, Russian, Serbian" - print "iso8859_6 iso-8859-6, arabic Arabic" - print "iso8859_7 iso-8859-7, greek, greek8 Greek" - print "iso8859_8 iso-8859-8, hebrew Hebrew" - print "iso8859_9 iso-8859-9, latin5, L5 Turkish" - print "iso8859_10 iso-8859-10, latin6, L6 Nordic languages" - print "iso8859_13 iso-8859-13 Baltic languages" - print "iso8859_14 iso-8859-14, latin8, L8 Celtic languages" - print "iso8859_15 iso-8859-15 Western Europe" - print "koi8_r Russian" - print "koi8_u Ukrainian" - print "mac_cyrillic maccyrillic Bulgarian, Byelorussian, Macedonian, Russian, Serbian" - print "mac_greek macgreek Greek" - print "mac_iceland maciceland Icelandic" - print "mac_latin2 maclatin2, maccentraleurope Central and Eastern Europe" - print "mac_roman macroman Western Europe" - print "mac_turkish macturkish Turkish" - print "utf_16 U16, utf16 all languages" - print "utf_16_be UTF-16BE all languages (BMP only)" - print "utf_16_le UTF-16LE all languages (BMP only)" - print "utf_7 U7 all languages" - print "utf_8 U8, UTF, utf8 all languages" + print("Codec Aliases Languages") + print("ascii 646, us-ascii English") + print("cp037 IBM037, IBM039 English") + print("cp424 EBCDIC-CP-HE, IBM424 Hebrew") + print("cp437 437, IBM437 English") + print("cp500 EBCDIC-CP-BE, EBCDIC-CP-CH, IBM500 Western Europe") + print("cp737 Greek") + print("cp775 IBM775 Baltic languages") + print("cp850 850, IBM850 Western Europe") + print("cp852 852, IBM852 Central and Eastern Europe") + print("cp855 855, IBM855 Bulgarian, Byelorussian, Macedonian, Russian, Serbian") + print("cp856 Hebrew") + print("cp857 857, IBM857 Turkish") + print("cp860 860, IBM860 Portuguese") + print("cp861 861, CP-IS, IBM861 Icelandic") + print("cp862 862, IBM862 Hebrew") + print("cp863 863, IBM863 Canadian") + 
print("cp864 IBM864 Arabic") + print("cp865 865, IBM865 Danish, Norwegian") + print("cp869 869, CP-GR, IBM869 Greek") + print("cp874 Thai") + print("cp875 Greek") + print("cp1006 Urdu") + print("cp1026 ibm1026 Turkish") + print("cp1140 ibm1140 Western Europe") + print("cp1250 windows-1250 Central and Eastern Europe") + print("cp1251 windows-1251 Bulgarian, Byelorussian, Macedonian, Russian, Serbian") + print("cp1252 windows-1252 Western Europe") + print("cp1253 windows-1253 Greek") + print("cp1254 windows-1254 Turkish") + print("cp1255 windows-1255 Hebrew") + print("cp1256 windows-1256 Arabic") + print("cp1257 windows-1257 Baltic languages") + print("cp1258 windows-1258 Vietnamese") + print("latin_1 iso-8859-1, iso8859-1, 8859, cp819, latin, latin1, L1 West Europe") + print("iso8859_2 iso-8859-2, latin2, L2 Central and Eastern Europe") + print("iso8859_3 iso-8859-3, latin3, L3 Esperanto, Maltese") + print("iso8859_4 iso-8859-4, latin4, L4 Baltic languagues") + print("iso8859_5 iso-8859-5, cyrillic Bulgarian, Byelorussian, Macedonian, Russian, Serbian") + print("iso8859_6 iso-8859-6, arabic Arabic") + print("iso8859_7 iso-8859-7, greek, greek8 Greek") + print("iso8859_8 iso-8859-8, hebrew Hebrew") + print("iso8859_9 iso-8859-9, latin5, L5 Turkish") + print("iso8859_10 iso-8859-10, latin6, L6 Nordic languages") + print("iso8859_13 iso-8859-13 Baltic languages") + print("iso8859_14 iso-8859-14, latin8, L8 Celtic languages") + print("iso8859_15 iso-8859-15 Western Europe") + print("koi8_r Russian") + print("koi8_u Ukrainian") + print("mac_cyrillic maccyrillic Bulgarian, Byelorussian, Macedonian, Russian, Serbian") + print("mac_greek macgreek Greek") + print("mac_iceland maciceland Icelandic") + print("mac_latin2 maclatin2, maccentraleurope Central and Eastern Europe") + print("mac_roman macroman Western Europe") + print("mac_turkish macturkish Turkish") + print("utf_16 U16, utf16 all languages") + print("utf_16_be UTF-16BE all languages (BMP only)") + print("utf_16_le UTF-16LE all languages (BMP only)") + print("utf_7 U7 all languages") + print("utf_8 U8, UTF, utf8 all languages") def parseIgnoreString(string): @@ -254,13 +255,13 @@ def parseIgnoreString(string): delimiter = delimiter.replace("\\t", "\t") # Do some error checking if len(delimiter) > 1: - print >>sys.stderr, "Delimeter should only be a single character. Using first character" + delimiter[0] + print("Delimeter should only be a single character. Using first character" + delimiter[0], file=sys.stderr) delimiter = delimiter[0] if buck2uni.get(delimiter): - print >> sys.stderr, "Invalid delimiter. \"" + delimiter + "\" is part of the Buckwalter character set." - print >> sys.stderr, "This will obviously cause much confusion as a delimiter!" - print >> sys.stderr, "Please try again. Aborting..." + print("Invalid delimiter. \"" + delimiter + "\" is part of the Buckwalter character set.", file=sys.stderr) + print("This will obviously cause much confusion as a delimiter!", file=sys.stderr) + print("Please try again. Aborting...", file=sys.stderr) sys.exit(1) # If no delimiter was set then, set the default to " " (space) @@ -303,16 +304,16 @@ def parseIgnoreString(string): # specified output encoding. outFile = codecs.open(outFilename, "w", outEnc) - except IOError, msg: + except IOError as msg: # A problem occurred when trying to open this file. Report to # user... - print msg + print(msg) sys.exit(1) # Script can not work without somewhere to store the transliteration. # Exit. else: - print "Must specify a file to use store the output! 
Aborting..." + print("Must specify a file to use store the output! Aborting...") sys.exit(1) # Providing a file for input was specified... @@ -322,15 +323,15 @@ def parseIgnoreString(string): # specified input encoding. inFile = codecs.open(inFilename, "r", inEnc) - except IOError, msg: + except IOError as msg: # A problem occurred when trying to open this file. Report to # user... - print msg + print(msg) sys.exit(1) # This script requires a file to read from. Exit. else: - print "Must specify a file to use as input! Aborting..." + print("Must specify a file to use as input! Aborting...") sys.exit(1) def getColsFromRange(cRange): @@ -344,7 +345,7 @@ def getColsFromRange(cRange): # If it contains a hyphen (e.g., 1-3) if hyphenSearch.search(i): [start, end] = i.split("-") - columns = columns + range(int(start)-1,int(end)) + columns = columns + list(range(int(start)-1,int(end))) else: columns.append(int(i)-1) @@ -441,9 +442,9 @@ def transliterateString(inString): currentLineNumber = currentLineNumber + 1 - except UnicodeError, msg: + except UnicodeError as msg: # A problem when writing occurred. Report to user... - print msg + print(msg) sys.exit(1) # All done! Better close the files used before terminating... diff --git a/egs/uw3/v1/local/make_features.py b/egs/uw3/v1/local/make_features.py index dd0a30a19d7..e0211963e39 100755 --- a/egs/uw3/v1/local/make_features.py +++ b/egs/uw3/v1/local/make_features.py @@ -24,8 +24,8 @@ parser = argparse.ArgumentParser(description="""Converts images (in 'dir'/images.scp) to features and writes them to standard output in text format.""") -parser.add_argument('dir', type=str, help='data directory (should contain images.scp)') -parser.add_argument('--out-ark', type=str, default='-', help='where to write the output feature file.') +parser.add_argument('dir', help='data directory (should contain images.scp)') +parser.add_argument('--out-ark', default='-', help='where to write the output feature file.') parser.add_argument('--feat-dim', type=int, default=40, help='size to scale the height of all images (i.e. 
the dimension of the resulting features)') parser.add_argument('--pad', type=bool, default=False, help='pad the left and right of the images with 10 white pixels.') @@ -43,7 +43,7 @@ def write_kaldi_matrix(file_handle, matrix, key): if num_cols != len(matrix[row_index]): raise Exception("All the rows of a matrix are expected to " "have the same length") - file_handle.write(" ".join(map(lambda x: str(x), matrix[row_index]))) + file_handle.write(" ".join([str(x) for x in matrix[row_index]])) if row_index != num_rows - 1: file_handle.write("\n") file_handle.write(" ]\n") diff --git a/egs/uw3/v1/local/process_data.py b/egs/uw3/v1/local/process_data.py index f5b37b04c2f..3643c0aca89 100755 --- a/egs/uw3/v1/local/process_data.py +++ b/egs/uw3/v1/local/process_data.py @@ -14,8 +14,8 @@ import random parser = argparse.ArgumentParser(description="""Creates data/train and data/test.""") -parser.add_argument('database_path', type=str, help='path to downloaded (and extracted) UW3 corpus') -parser.add_argument('out_dir', type=str, default='data', +parser.add_argument('database_path', help='path to downloaded (and extracted) UW3 corpus') +parser.add_argument('out_dir', default='data', help='where to create the train and test data directories') args = parser.parse_args() @@ -53,9 +53,9 @@ coin = random.randint(0, 20) if coin >= 1: train_text_fh.write(utt_id + ' ' + text + '\n') - train_utt2spk_fh.write(utt_id + ' ' + str(page_count) + '\n') - train_image_fh.write(utt_id + ' ' + image_path + '\n') + train_utt2spk_fh.write("{} {}\n".format(utt_id, page_count)) + train_image_fh.write("{} {}\n".format(utt_id, image_path)) elif coin < 1: - test_text_fh.write(utt_id + ' ' + text + '\n') - test_utt2spk_fh.write(utt_id + ' ' + str(page_count) + '\n') - test_image_fh.write(utt_id + ' ' + image_path + '\n') + test_text_fh.write("{} {}\n".format(utt_id, text)) + test_utt2spk_fh.write("{} {}\n".format(utt_id, page_count)) + test_image_fh.write("{} {}\n".format(utt_id, image_path)) diff --git a/egs/voxceleb/v1/local/make_musan.py b/egs/voxceleb/v1/local/make_musan.py index 74c434990fb..565bfce0cc9 100755 --- a/egs/voxceleb/v1/local/make_musan.py +++ b/egs/voxceleb/v1/local/make_musan.py @@ -47,9 +47,9 @@ def prepare_music(root_dir, use_vocals): utt2wav_str = utt2wav_str + utt + " " + utt2wav[utt] + "\n" num_good_files += 1 else: - print("Missing file", utt) + print("Missing file {}".format(utt)) num_bad_files += 1 - print("In music directory, processed", num_good_files, "files;", num_bad_files, "had missing wav data") + print("In music directory, processed {} files; {} had missing wav data".format(num_good_files, num_bad_files)) return utt2spk_str, utt2wav_str def prepare_speech(root_dir): @@ -73,9 +73,9 @@ def prepare_speech(root_dir): utt2wav_str = utt2wav_str + utt + " " + utt2wav[utt] + "\n" num_good_files += 1 else: - print("Missing file", utt) + print("Missing file {}".format(utt)) num_bad_files += 1 - print("In speech directory, processed", num_good_files, "files;", num_bad_files, "had missing wav data") + print("In speech directory, processed {} files; {} had missing wav data".format(num_good_files, num_bad_files)) return utt2spk_str, utt2wav_str def prepare_noise(root_dir): @@ -99,9 +99,9 @@ def prepare_noise(root_dir): utt2wav_str = utt2wav_str + utt + " " + utt2wav[utt] + "\n" num_good_files += 1 else: - print("Missing file", utt) + print("Missing file {}".format(utt)) num_bad_files += 1 - print("In noise directory, processed", num_good_files, "files;", num_bad_files, "had missing wav data") + print("In
noise directory, processed {} files; {} had missing wav data".format(num_good_files, num_bad_files)) return utt2spk_str, utt2wav_str def main(): diff --git a/egs/voxceleb/v1/local/prepare_for_eer.py b/egs/voxceleb/v1/local/prepare_for_eer.py index 6bfa04e011b..2f569b70bc5 100755 --- a/egs/voxceleb/v1/local/prepare_for_eer.py +++ b/egs/voxceleb/v1/local/prepare_for_eer.py @@ -16,4 +16,4 @@ spkrutt2target[spkr+utt]=target for line in scores: spkr, utt, score = line.strip().split() - print(score, spkrutt2target[spkr+utt]) + print("{} {}".format(score, spkrutt2target[spkr+utt])) diff --git a/egs/voxceleb/v2/run.sh b/egs/voxceleb/v2/run.sh index e57799cee27..f8c50d7f9df 100755 --- a/egs/voxceleb/v2/run.sh +++ b/egs/voxceleb/v2/run.sh @@ -27,7 +27,7 @@ stage=0 if [ $stage -le 0 ]; then local/make_voxceleb2.pl $voxceleb2_root dev data/voxceleb2_train local/make_voxceleb2.pl $voxceleb2_root test data/voxceleb2_test - # This script reates data/voxceleb1_test and data/voxceleb1_train. + # This script creates data/voxceleb1_test and data/voxceleb1_train. # Our evaluation set is the test portion of VoxCeleb1. local/make_voxceleb1.pl $voxceleb1_root data # We'll train on all of VoxCeleb2, plus the training portion of VoxCeleb1. diff --git a/egs/voxforge/gst_demo/run-live.py b/egs/voxforge/gst_demo/run-live.py index 725a306c42c..7876e5f2046 100755 --- a/egs/voxforge/gst_demo/run-live.py +++ b/egs/voxforge/gst_demo/run-live.py @@ -6,6 +6,7 @@ # # Apache 2.0 +from __future__ import print_function import sys import os import gi @@ -46,7 +47,7 @@ def init_gst(self): """Initialize the speech components""" self.pulsesrc = Gst.ElementFactory.make("pulsesrc", "pulsesrc") if self.pulsesrc == None: - print >> sys.stderr, "Error loading pulsesrc GST plugin. You probably need the gstreamer1.0-pulseaudio package" + print("Error loading pulsesrc GST plugin. You probably need the gstreamer1.0-pulseaudio package", file=sys.stderr) sys.exit() self.audioconvert = Gst.ElementFactory.make("audioconvert", "audioconvert") self.audioresample = Gst.ElementFactory.make("audioresample", "audioresample") @@ -56,7 +57,7 @@ def init_gst(self): if self.asr: model_dir = "online-data/models/tri2b_mmi/" if not os.path.isdir(model_dir): - print >> sys.stderr, "Model (%s) not downloaded. Run run-simulated.sh first" % model_dir + print("Model (%s) not downloaded. Run run-simulated.sh first" % model_dir, file=sys.stderr) sys.exit(1) self.asr.set_property("fst", model_dir + "HCLG.fst") self.asr.set_property("lda-mat", model_dir + "matrix") @@ -67,12 +68,12 @@ def init_gst(self): self.asr.set_property("beam", 12.0) self.asr.set_property("acoustic-scale", 0.0769) else: - print >> sys.stderr, "Couldn't create the onlinegmmfasterdecoder element. " + print("Couldn't create the onlinegmmfasterdecoder element. ", file=sys.stderr) if "GST_PLUGIN_PATH" in os.environ: - print >> sys.stderr, "Have you compiled the Kaldi GStreamer plugin?" 
+ print("Have you compiled the Kaldi GStreamer plugin?", file=sys.stderr) else: - print >> sys.stderr, "You probably need to set the GST_PLUGIN_PATH envoronment variable" - print >> sys.stderr, "Try running: GST_PLUGIN_PATH=../../../src/gst-plugin %s" % sys.argv[0] + print("You probably need to set the GST_PLUGIN_PATH envoronment variable", file=sys.stderr) + print("Try running: GST_PLUGIN_PATH=../../../src/gst-plugin %s" % sys.argv[0], file=sys.stderr) sys.exit(); # initially silence the decoder @@ -111,10 +112,10 @@ def button_clicked(self, button): if __name__ == '__main__': app = DemoApp() - print ''' + print(''' The (bigram) language model used to build the decoding graph was estimated on an audio book's text. The text in question is King Solomon's Mines" (http://www.gutenberg.org/ebooks/2166). - You may want to read some sentences from this book first ...''' + You may want to read some sentences from this book first ...''') Gtk.main() diff --git a/egs/voxforge/s5/local/make_trans.py b/egs/voxforge/s5/local/make_trans.py index 1b4f5c4136a..612755c8be4 100755 --- a/egs/voxforge/s5/local/make_trans.py +++ b/egs/voxforge/s5/local/make_trans.py @@ -12,11 +12,12 @@ if this is the case produces a transcript line for each file in the format: prefix_a0405 IT SEEMED THE ORDAINED ORDER OF THINGS THAT DOGS SHOULD WORK """ +from __future__ import print_function import sys def err(msg): - print >> sys.stderr, msg + print(msg, file=sys.stderr) if len(sys.argv) < 3: err("Usage: %s ... " % sys.argv[0]) @@ -46,5 +47,5 @@ def err(msg): if not uid in utt2trans: err("No transcript found for %s_%s" % (id_prefix, uid)) continue - print "%s-%s %s" % (id_prefix, uid, utt2trans[uid]) + print("%s-%s %s" % (id_prefix, uid, utt2trans[uid])) diff --git a/egs/vystadial_cz/online_demo/build_reference.py b/egs/vystadial_cz/online_demo/build_reference.py index 1be78391d2f..aea12a2c8bc 100755 --- a/egs/vystadial_cz/online_demo/build_reference.py +++ b/egs/vystadial_cz/online_demo/build_reference.py @@ -1,6 +1,7 @@ #!/usr/bin/env python # encoding: utf-8 from __future__ import unicode_literals +from __future__ import print_function import glob import sys @@ -8,7 +9,7 @@ import codecs def build_reference(wav_scp, ref_path): - print wav_scp, ref_path + print(wav_scp, ref_path) with codecs.open(ref_path, 'w', 'utf-8') as w: with codecs.open(wav_scp, 'r', 'utf-8') as scp: for line in scp: @@ -31,8 +32,8 @@ def build_reference(wav_scp, ref_path): usage_args = {'exec': sys.argv[0]} if len(sys.argv) != 3: - print >> sys.stderr, "Wrong number of arguments" - print >> sys.stderr, usage % {'exec': sys.argv[0]} + print("Wrong number of arguments", file=sys.stderr) + print(usage % {'exec': sys.argv[0]}, file=sys.stderr) sys.exit(1) if sys.argv[1].endswith('scp'): @@ -41,12 +42,12 @@ def build_reference(wav_scp, ref_path): scps = glob.glob(os.path.join(sys.argv[1], '*.scp')) target_dir = sys.argv[2] if not len(scps): - print >> sys.stderr, "No '*.scp' files found" - print >> sys.stderr, usage % {'exec': sys.argv[0]} + print("No '*.scp' files found", file=sys.stderr) + print(usage % {'exec': sys.argv[0]}, file=sys.stderr) sys.exit(1) if not os.path.isdir(target_dir): - print >> sys.stderr, "No '*.scp' files found" - print >> sys.stderr, usage % {'exec': sys.argv[0]} + print("No '*.scp' files found", file=sys.stderr) + print(usage % {'exec': sys.argv[0]}, file=sys.stderr) sys.exit(1) refers = [os.path.join(target_dir, os.path.basename(scp) + '.tra') for scp in scps] diff --git a/egs/vystadial_cz/online_demo/live-demo.py 
b/egs/vystadial_cz/online_demo/live-demo.py index 6b41c12c739..320a930735f 100755 --- a/egs/vystadial_cz/online_demo/live-demo.py +++ b/egs/vystadial_cz/online_demo/live-demo.py @@ -15,6 +15,7 @@ # See the Apache 2 License for the specific language governing permissions and # limitations under the License. # from __future__ import unicode_literals +from __future__ import print_function import pyaudio from kaldi.decoders import PyOnlineLatgenRecogniser @@ -29,7 +30,7 @@ CHANNELS, RATE, FORMAT = 1, 16000, pyaudio.paInt16 -class LiveDemo: +class LiveDemo(object): def __init__(self, audio_batch_size, wst, dec_args): self.batch_size = audio_batch_size @@ -127,7 +128,7 @@ def save_wav(self): if __name__ == '__main__': audio_batch_size, wst_path = int(sys.argv[1]), sys.argv[2] argv = sys.argv[3:] - print >> sys.stderr, 'Python args: %s' % str(sys.argv) + print('Python args: %s' % str(sys.argv), file=sys.stderr) wst = wst2dict(wst_path) demo = LiveDemo(audio_batch_size, wst, argv) diff --git a/egs/vystadial_cz/online_demo/pykaldi-online-latgen-recogniser.py b/egs/vystadial_cz/online_demo/pykaldi-online-latgen-recogniser.py index 02a0400921c..0008a4c01f1 100755 --- a/egs/vystadial_cz/online_demo/pykaldi-online-latgen-recogniser.py +++ b/egs/vystadial_cz/online_demo/pykaldi-online-latgen-recogniser.py @@ -14,6 +14,8 @@ # See the Apache 2 License for the specific language governing permissions and # limitations under the License. # from __future__ import unicode_literals +from __future__ import division +from __future__ import print_function from kaldi.utils import load_wav, wst2dict, lattice_to_nbest from kaldi.decoders import PyOnlineLatgenRecogniser @@ -31,14 +33,14 @@ def write_decoded(f, wav_name, word_ids, wst): if wst is not None: decoded = [wst[w] for w in best_path] else: - decoded = [unicode(w) for w in best_path] + decoded = [str(w) for w in best_path] line = u' '.join([wav_name] + decoded + ['\n']) if DEBUG: - print '%s best path %s' % (wav_name, decoded.encode('UTF-8')) + print('%s best path %s' % (wav_name, decoded.encode('UTF-8'))) for i, s in enumerate(word_ids): if i > 0: break - print 'best path %d: %s' % (i, str(s)) + print('best path %d: %s' % (i, str(s))) f.write(line.encode('UTF-8')) @@ -55,11 +57,11 @@ def decode(d, pcm): while dec_t > 0: decoded_frames += dec_t dec_t = d.decode(max_frames=10) - print "forward decode: %s secs" % str(time.time() - start) + print("forward decode: %s secs" % str(time.time() - start)) start = time.time() d.prune_final() lik, lat = d.get_lattice() - print "backward decode: %s secs" % str(time.time() - start) + print("backward decode: %s secs" % str(time.time() - start)) d.reset(keep_buffer_data=False) return (lat, lik, decoded_frames) @@ -72,7 +74,7 @@ def decode_wrap(argv, audio_batch_size, wav_paths, for wav_name, wav_path in wav_paths: sw, sr = 2, 16000 # 16-bit audio so 1 sample_width = 2 chars pcm = load_wav(wav_path, def_sample_width=sw, def_sample_rate=sr) - print '%s has %f sec' % (wav_name, (float(len(pcm)) / sw) / sr) + print('%s has %f sec' % (wav_name, (float(len(pcm)) / sw) / sr)) lat, lik, decoded_frames = decode(d, pcm) lat.isyms = lat.osyms = fst.read_symbols_text(wst_path) if DEBUG: @@ -80,8 +82,8 @@ def decode_wrap(argv, audio_batch_size, wav_paths, f.write(lat._repr_svg_()) lat.write('%s_pykaldi.fst' % wav_name) - print "Log-likelihood per frame for utterance %s is %f over %d frames" % ( - wav_name, (lik / decoded_frames), decoded_frames) + print("Log-likelihood per frame for utterance %s is %f over %d frames" % ( + wav_name, 
(lik / decoded_frames), decoded_frames)) word_ids = lattice_to_nbest(lat, n=10) write_decoded(file_output, wav_name, word_ids, wst) @@ -90,7 +92,7 @@ def decode_wrap(argv, audio_batch_size, wav_paths, audio_scp, audio_batch_size = sys.argv[1], int(sys.argv[2]) dec_hypo, wst_path = sys.argv[3], sys.argv[4] argv = sys.argv[5:] - print >> sys.stderr, 'Python args: %s' % str(sys.argv) + print('Python args: %s' % str(sys.argv), file=sys.stderr) # open audio_scp, decode and write to dec_hypo file with open(audio_scp, 'rb') as r: diff --git a/egs/vystadial_cz/s5/local/results.py b/egs/vystadial_cz/s5/local/results.py index a7c19af214c..f37109d5fcb 100755 --- a/egs/vystadial_cz/s5/local/results.py +++ b/egs/vystadial_cz/s5/local/results.py @@ -14,6 +14,8 @@ # MERCHANTABLITY OR NON-INFRINGEMENT. # See the Apache 2 License for the specific language governing permissions and # limitations under the License. # +from __future__ import division +from __future__ import print_function import argparse import glob import sys @@ -29,8 +31,8 @@ def extract_stat(wer_file): ser = float(s[2].split()[1]) except Exception as e: - print sys.stderr, 'Error parsing file %s' % wer_file - print sys.stderr, str(e) + print(sys.stderr, 'Error parsing file %s' % wer_file) + print(sys.stderr, str(e)) return wer, ser @@ -47,8 +49,8 @@ def extractResults(path): wer, ser = extract_stat(wf) table.append((exp, dataset, lm, lm_w, wer, ser)) except Exception as e: - print >> sys.stderr, 'failed to parse %s' % wf - print >> sys.stderr, str(e) + print('failed to parse %s' % wf, file=sys.stderr) + print(str(e), file=sys.stderr) return table @@ -105,7 +107,7 @@ def Table2LatexTable(table): def createSmallTable(r): d = [] - for k, v in r.iteritems(): + for k, v in r.items(): w, s, r = v if w == []: minw = None @@ -115,7 +117,7 @@ def createSmallTable(r): mins = None else: mins = min(s) # returns tuple if s is list of tuples - mean_r = sum(r) / float(len(r)) + mean_r = float(sum(r)) / len(r) d.append([k, mean_r, minw, mins]) t = Table(d, ['exp', 'RT coef', 'WER', 'SER']) return t @@ -167,7 +169,7 @@ def createSmallTable(r): # remove duplicates: duplicates if equal mimimum wer in dev set min_dev_un = [(e, lm, lmw) for ((e, lm), lmw) in - dict([((e, lm), lmw) for e, lm, lmw in min_dev]).items()] + list(dict([((e, lm), lmw) for e, lm, lmw in min_dev]).items())] # sort according LM -> sort results according experiment & LMs min_dev_un.sort(key=lambda x: (x[1], x[0])) @@ -182,6 +184,6 @@ def createSmallTable(r): d.append(x[0]) t = Table(data=d, colnames=['exp', 'set', 'LM', 'LMW', 'WER', 'SER']) - print str(t) + print(str(t)) if args.latex: - print Table2LatexTable(t) + print(Table2LatexTable(t)) diff --git a/egs/vystadial_en/s5/local/results.py b/egs/vystadial_en/s5/local/results.py index a7c19af214c..f37109d5fcb 100755 --- a/egs/vystadial_en/s5/local/results.py +++ b/egs/vystadial_en/s5/local/results.py @@ -14,6 +14,8 @@ # MERCHANTABLITY OR NON-INFRINGEMENT. # See the Apache 2 License for the specific language governing permissions and # limitations under the License.
# +from __future__ import division +from __future__ import print_function import argparse import glob import sys @@ -29,8 +31,8 @@ def extract_stat(wer_file): ser = float(s[2].split()[1]) except Exception as e: - print sys.stderr, 'Error parsing file %s' % wer_file - print sys.stderr, str(e) + print(sys.stderr, 'Error parsing file %s' % wer_file) + print(sys.stderr, str(e)) return wer, ser @@ -47,8 +49,8 @@ def extractResults(path): wer, ser = extract_stat(wf) table.append((exp, dataset, lm, lm_w, wer, ser)) except Exception as e: - print >> sys.stderr, 'failed to parse %s' % wf - print >> sys.stderr, str(e) + print('failed to parse %s' % wf, file=sys.stderr) + print(str(e), file=sys.stderr) return table @@ -105,7 +107,7 @@ def Table2LatexTable(table): def createSmallTable(r): d = [] - for k, v in r.iteritems(): + for k, v in r.items(): w, s, r = v if w == []: minw = None @@ -115,7 +117,7 @@ def createSmallTable(r): mins = None else: mins = min(s) # returns tuple if s is list of tuples - mean_r = sum(r) / float(len(r)) + mean_r = float(sum(r)) / len(r) d.append([k, mean_r, minw, mins]) t = Table(d, ['exp', 'RT coef', 'WER', 'SER']) return t @@ -167,7 +169,7 @@ def createSmallTable(r): # remove duplicates: duplicates if equal mimimum wer in dev set min_dev_un = [(e, lm, lmw) for ((e, lm), lmw) in - dict([((e, lm), lmw) for e, lm, lmw in min_dev]).items()] + list(dict([((e, lm), lmw) for e, lm, lmw in min_dev]).items())] # sort according LM -> sort results according experiment & LMs min_dev_un.sort(key=lambda x: (x[1], x[0])) @@ -182,6 +184,6 @@ def createSmallTable(r): d.append(x[0]) t = Table(data=d, colnames=['exp', 'set', 'LM', 'LMW', 'WER', 'SER']) - print str(t) + print(str(t)) if args.latex: - print Table2LatexTable(t) + print(Table2LatexTable(t)) diff --git a/egs/wsj/s5/local/chain/tuning/run_tdnn_1g.sh b/egs/wsj/s5/local/chain/tuning/run_tdnn_1g.sh index 1724c057e12..526059b7b90 100755 --- a/egs/wsj/s5/local/chain/tuning/run_tdnn_1g.sh +++ b/egs/wsj/s5/local/chain/tuning/run_tdnn_1g.sh @@ -220,6 +220,7 @@ if [ $stage -le 16 ]; then --chain.apply-deriv-weights=false \ --chain.lm-opts="--num-extra-lm-states=2000" \ --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ --trainer.srand=$srand \ --trainer.max-param-change=2.0 \ --trainer.num-epochs=10 \ diff --git a/egs/wsj/s5/steps/cleanup/combine_short_segments.py b/egs/wsj/s5/steps/cleanup/combine_short_segments.py index 1d14bd2a57f..099b92882a9 100755 --- a/egs/wsj/s5/steps/cleanup/combine_short_segments.py +++ b/egs/wsj/s5/steps/cleanup/combine_short_segments.py @@ -284,7 +284,7 @@ def CombineSegments(input_dir, output_dir, minimum_duration): assert(cur_utt_dur == combined_duration) # now modify the utts list - combined_indices = range(left_index, right_index + 1) + combined_indices = list(range(left_index, right_index + 1)) # start popping from the largest index so that the lower # indexes are valid for i in combined_indices[::-1]: diff --git a/egs/wsj/s5/steps/cleanup/internal/get_pron_stats.py b/egs/wsj/s5/steps/cleanup/internal/get_pron_stats.py index 414875f9013..a33ba85d9fa 100755 --- a/egs/wsj/s5/steps/cleanup/internal/get_pron_stats.py +++ b/egs/wsj/s5/steps/cleanup/internal/get_pron_stats.py @@ -4,6 +4,7 @@ # Apache 2.0. 
from __future__ import print_function +from __future__ import division import argparse import sys import warnings @@ -211,7 +212,7 @@ def GetStatsFromCtmProns(silphones, optional_silence, non_scored_words, ctm_pron return stats def WriteStats(stats, file_handle): - for word_pron, count in stats.iteritems(): + for word_pron, count in stats.items(): print('{0} {1} {2}'.format(count, word_pron[0], word_pron[1]), file=file_handle) file_handle.close() diff --git a/egs/wsj/s5/steps/cleanup/internal/make_one_biased_lm.py b/egs/wsj/s5/steps/cleanup/internal/make_one_biased_lm.py index f37fa866b0f..e41a67705e9 100755 --- a/egs/wsj/s5/steps/cleanup/internal/make_one_biased_lm.py +++ b/egs/wsj/s5/steps/cleanup/internal/make_one_biased_lm.py @@ -4,6 +4,7 @@ # Apache 2.0. from __future__ import print_function +from __future__ import division import sys import argparse import math @@ -47,7 +48,7 @@ -class NgramCounts: +class NgramCounts(object): ## A note on data-structure. ## Firstly, all words are represented as integers. ## We store n-gram counts as an array, indexed by (history-length == n-gram order minus one) @@ -139,7 +140,7 @@ def GetHistToTotalCount(self): # LM-states that would back off to 'this' lm-state, in the total. def CompletelyDiscountLowCountStates(self, min_count): hist_to_total_count = self.GetHistToTotalCount() - for n in reversed(range(2, self.ngram_order)): + for n in reversed(list(range(2, self.ngram_order))): this_order_counts = self.counts[n] for hist in this_order_counts.keys(): if hist_to_total_count[hist] < min_count: @@ -156,7 +157,7 @@ def CompletelyDiscountLowCountStates(self, min_count): # with interpolation). def ApplyBackoff(self, D): assert D > 0.0 and D < 1.0 - for n in reversed(range(1, self.ngram_order)): + for n in reversed(list(range(1, self.ngram_order))): this_order_counts = self.counts[n] for hist, word_to_count in this_order_counts.items(): backoff_hist = hist[1:] @@ -182,7 +183,7 @@ def Print(self, info_string): for this_order_counts in self.counts: for hist, word_to_count in this_order_counts.items(): this_total_count = sum(word_to_count.values()) - print(str(hist) + ': total={0} '.format(this_total_count), + print('{0}: total={1} '.format(hist, this_total_count), end='', file=sys.stderr) print(' '.join(['{0} -> {1} '.format(word, count) for word, count in word_to_count.items() ]), @@ -242,10 +243,10 @@ def GetHistToStateMap(self): def GetProb(self, hist, word, total_count_map): total_count = total_count_map[hist] word_to_count = self.counts[len(hist)][hist] - prob = word_to_count[word] / total_count + prob = float(word_to_count[word]) / total_count if len(hist) > 0 and word != self.backoff_symbol: prob_in_backoff = self.GetProb(hist[1:], word, total_count_map) - backoff_prob = word_to_count[self.backoff_symbol] / total_count + backoff_prob = float(word_to_count[self.backoff_symbol]) / total_count prob += backoff_prob * prob_in_backoff return prob @@ -262,7 +263,7 @@ def PrintAsFst(self, word_disambig_symbol): hist_to_state = self.GetHistToStateMap() total_count_map = self.GetTotalCountMap() - for n in [ 1, 0 ] + range(2, self.ngram_order): + for n in [ 1, 0 ] + list(range(2, self.ngram_order)): this_order_counts = self.counts[n] # For order 1, make sure the keys are sorted. 
keys = this_order_counts.keys() if n != 1 else sorted(this_order_counts.keys()) diff --git a/egs/wsj/s5/steps/cleanup/internal/resolve_ctm_edits_overlaps.py b/egs/wsj/s5/steps/cleanup/internal/resolve_ctm_edits_overlaps.py index ad03b557bfe..1dae735304f 100755 --- a/egs/wsj/s5/steps/cleanup/internal/resolve_ctm_edits_overlaps.py +++ b/egs/wsj/s5/steps/cleanup/internal/resolve_ctm_edits_overlaps.py @@ -15,6 +15,7 @@ """ from __future__ import print_function +from __future__ import division import argparse import collections import logging @@ -228,7 +229,7 @@ def resolve_overlaps(ctm_edits, segments): try: cur_utt_end_index = next( (i for i, line in enumerate(ctm_edits_for_cur_utt) - if line[2] + line[3] / 2.0 > window_length - overlap)) + if line[2] + line[3] / 2.0 > window_length - overlap)) except StopIteration: cur_utt_end_index = len(ctm_edits_for_cur_utt) @@ -299,7 +300,7 @@ def run(args): segments, reco2utt = read_segments(args.segments) ctm_edits = read_ctm_edits(args.ctm_edits_in, segments) - for reco, utts in reco2utt.iteritems(): + for reco, utts in reco2utt.items(): ctm_edits_for_reco = [] for utt in sorted(utts, key=lambda x: segments[x][1]): if (reco, utt) in ctm_edits: diff --git a/egs/wsj/s5/steps/cleanup/internal/retrieve_similar_docs.py b/egs/wsj/s5/steps/cleanup/internal/retrieve_similar_docs.py index eb0b18f0408..9594d2ecc60 100755 --- a/egs/wsj/s5/steps/cleanup/internal/retrieve_similar_docs.py +++ b/egs/wsj/s5/steps/cleanup/internal/retrieve_similar_docs.py @@ -223,7 +223,7 @@ def read_map(file_handle, num_values_per_key=None, def get_document_ids(source_docs, indexes): indexes = sorted( - [(key, value[0], value[1]) for key, value in indexes.iteritems()], + [(key, value[0], value[1]) for key, value in indexes.items()], key=lambda x: x[0]) doc_ids = [] @@ -273,7 +273,7 @@ def run(args): "Did not get scores for query {0}".format(query_id)) if args.verbose > 2: - for tup, score in scores.iteritems(): + for tup, score in scores.items(): logger.debug("Score, {num}: {0} {1} {2}".format( tup[0], tup[1], score, num=num_queries)) diff --git a/egs/wsj/s5/steps/cleanup/internal/segment_ctm_edits.py b/egs/wsj/s5/steps/cleanup/internal/segment_ctm_edits.py index 39f6d38d6bf..39d6cb6ed80 100755 --- a/egs/wsj/s5/steps/cleanup/internal/segment_ctm_edits.py +++ b/egs/wsj/s5/steps/cleanup/internal/segment_ctm_edits.py @@ -5,6 +5,7 @@ # Apache 2.0 from __future__ import print_function +from __future__ import division import sys, operator, argparse, os from collections import defaultdict @@ -171,7 +172,7 @@ def ComputeSegmentCores(split_lines_of_utt): return segment_ranges -class Segment: +class Segment(object): def __init__(self, split_lines_of_utt, start_index, end_index, debug_str = None): self.split_lines_of_utt = split_lines_of_utt # start_index is the index of the first line that appears in this @@ -551,7 +552,7 @@ def PossiblyTruncateStartForJunkProportion(self): if candidate_start_index is None: return # Nothing to do as there is no place to split. candidate_removed_piece_duration = candidate_start_time - self.StartTime() - if begin_junk_duration / candidate_removed_piece_duration < args.max_junk_proportion: + if float(begin_junk_duration) / candidate_removed_piece_duration < args.max_junk_proportion: return # Nothing to do as the candidate piece to remove has too # little junk. # OK, remove the piece. @@ -593,7 +594,7 @@ def PossiblyTruncateEndForJunkProportion(self): if candidate_end_index is None: return # Nothing to do as there is no place to split.
candidate_removed_piece_duration = self.EndTime() - candidate_end_time - if end_junk_duration / candidate_removed_piece_duration < args.max_junk_proportion: + if float(end_junk_duration) / candidate_removed_piece_duration < args.max_junk_proportion: return # Nothing to do as the candidate piece to remove has too # little junk. # OK, remove the piece. @@ -807,7 +808,7 @@ def TimeToString(time, frame_length): def WriteSegmentsForUtterance(text_output_handle, segments_output_handle, old_utterance_name, segments): - num_digits = len(str(len(segments))) + num_digits = len('{}'.format(len(segments))) for n in range(len(segments)): segment = segments[n] # split utterances will be named foo-bar-1 foo-bar-2, etc. @@ -840,24 +841,24 @@ def PrintDebugInfoForUtterance(ctm_edits_out_handle, info_to_print = [] for n in range(len(segments_for_utterance)): segment = segments_for_utterance[n] - start_string = 'start-segment-' + str(n+1) + '[' + segment.DebugInfo() + ']' + start_string = 'start-segment-{0}[{1}]'.format(n+1, segment.DebugInfo()) info_to_print.append( (segment.StartTime(), start_string) ) - end_string = 'end-segment-' + str(n+1) + end_string = 'end-segment-{}'.format(n+1) info_to_print.append( (segment.EndTime(), end_string) ) # for segments that were deleted we print info like start-deleted-segment-1, and # otherwise similar info to segments that were retained. for n in range(len(deleted_segments_for_utterance)): segment = deleted_segments_for_utterance[n] - start_string = 'start-deleted-segment-' + str(n+1) + '[' + segment.DebugInfo() + ']' + start_string = 'start-deleted-segment-{0}[{1}]'.format(n+1, segment.DebugInfo()) info_to_print.append( (segment.StartTime(), start_string) ) - end_string = 'end-deleted-segment-' + str(n+1) + end_string = 'end-deleted-segment-{}'.format(n+1) info_to_print.append( (segment.EndTime(), end_string) ) info_to_print = sorted(info_to_print) for i in range(len(split_lines_of_cur_utterance)): split_line=split_lines_of_cur_utterance[i] - split_line[0] += '[' + str(i) + ']' # add an index like [0], [1], to + split_line[0] += '[{}]'.format(i) # add an index like [0], [1], to # the utterance-id so we can easily # look up segment indexes. start_time = float(split_line[2]) diff --git a/egs/wsj/s5/steps/cleanup/internal/segment_ctm_edits_mild.py b/egs/wsj/s5/steps/cleanup/internal/segment_ctm_edits_mild.py index 46a9369ae98..9fcc2e89360 100755 --- a/egs/wsj/s5/steps/cleanup/internal/segment_ctm_edits_mild.py +++ b/egs/wsj/s5/steps/cleanup/internal/segment_ctm_edits_mild.py @@ -5,6 +5,7 @@ # Apache 2.0 from __future__ import print_function +from __future__ import division import argparse import copy import logging @@ -869,8 +870,7 @@ def relax_boundary_truncation(self, min_segment_length, # a * (length_with_truncation - length_with_relaxed_boundaries) # -> a = (length_cutoff - length_with_relaxed_boundaries) # / (length_with_truncation - length_with_relaxed_boundaries) - a = ((length_cutoff - length_with_relaxed_boundaries) - / (length_with_truncation - length_with_relaxed_boundaries)) + a = (length_cutoff - length_with_relaxed_boundaries) / (length_with_truncation - length_with_relaxed_boundaries) if a < 0.0 or a > 1.0: # TODO(vimal): Should this be an error? _global_logger.warn("bad 'a' value = %.4f", a) @@ -1756,7 +1756,7 @@ def time_to_string(time, frame_length): """ Gives time in string form as an exact multiple of the frame-length, e.g. 0.01 (after rounding). 
""" - n = round(time / frame_length) + n = round(time /frame_length) assert n >= 0 # The next function call will remove trailing zeros while printing it, so # that e.g. 0.01 will be printed as 0.01 and not 0.0099999999999999. It diff --git a/egs/wsj/s5/steps/cleanup/internal/taint_ctm_edits.py b/egs/wsj/s5/steps/cleanup/internal/taint_ctm_edits.py index 85e1df997a7..4e0e1ae2283 100755 --- a/egs/wsj/s5/steps/cleanup/internal/taint_ctm_edits.py +++ b/egs/wsj/s5/steps/cleanup/internal/taint_ctm_edits.py @@ -201,7 +201,7 @@ def PrintNonScoredStats(): percent_modified, percent_of_incorrect_modified), file = sys.stderr) - keys = sorted(ref_change_stats.keys(), reverse=True, + keys = sorted(list(ref_change_stats.keys()), reverse=True, key = lambda x: ref_change_stats[x]) num_keys_to_print = 40 if args.verbose >= 2 else 10 @@ -219,7 +219,7 @@ def PrintStats(): return print("taint_ctm_edits.py: processed {0} input lines, whose edit-types were: ".format(tot_lines) + ', '.join([ '%s = %.2f%%' % (k, num_lines_of_type[k] * 100.0 / tot_lines) - for k in sorted(num_lines_of_type.keys(), reverse = True, + for k in sorted(list(num_lines_of_type.keys()), reverse = True, key = lambda k: num_lines_of_type[k]) ]), file = sys.stderr) diff --git a/egs/wsj/s5/steps/cleanup/internal/tf_idf.py b/egs/wsj/s5/steps/cleanup/internal/tf_idf.py index 9b2f4d693a6..a098d9f2a44 100644 --- a/egs/wsj/s5/steps/cleanup/internal/tf_idf.py +++ b/egs/wsj/s5/steps/cleanup/internal/tf_idf.py @@ -6,6 +6,7 @@ """ from __future__ import print_function +from __future__ import division import logging import math import re @@ -51,8 +52,7 @@ def get_inverse_document_frequency(self, term, weighting_scheme="log"): if weighting_scheme == "log-smoothed": return math.log(1.0 + float(self.num_docs) / (1.0 + n_t)) if weighting_scheme == "probabilitic": - return math.log((self.num_docs - n_t - 1) - / (1.0 + n_t)) + return math.log((self.num_docs - n_t - 1) / (1.0 + n_t)) def accumulate(self, term): """Adds one count to the number of docs containing the term "term". @@ -66,7 +66,7 @@ def write(self, file_handle): ... for n-gram (, ... ) """ - for term, num in self.num_docs_for_term.iteritems(): + for term, num in self.num_docs_for_term.items(): if num == 0: continue assert isinstance(term, tuple) @@ -135,7 +135,7 @@ def compute_term_stats(self, idf_stats=None): based on the stored raw counts.""" if len(self.raw_counts) == 0: raise RuntimeError("No (term, doc) found in tf-stats.") - for tup, counts in self.raw_counts.iteritems(): + for tup, counts in self.raw_counts.items(): term = tup[0] if counts > self.max_counts_for_term.get(term, 0): @@ -149,7 +149,7 @@ def __str__(self): ... 
""" lines = [] - for tup, counts in self.raw_counts.iteritems(): + for tup, counts in self.raw_counts.items(): term, doc = tup lines.append("{order} {term} {doc} {counts}".format( order=len(term), term=" ".join(term), @@ -225,7 +225,7 @@ def compute_similarity_scores(self, source_tfidf, source_docs=None, num_terms_per_doc = {} similarity_scores = {} - for tup, value in self.tf_idf.iteritems(): + for tup, value in self.tf_idf.items(): term, doc = tup num_terms_per_doc[doc] = num_terms_per_doc.get(doc, 0) + 1 @@ -253,19 +253,18 @@ def compute_similarity_scores(self, source_tfidf, source_docs=None, similarity_scores.get((doc, src_doc), 0) + src_value * value) else: - for src_tup, src_value in source_tfidf.tf_idf.iteritems(): + for src_tup, src_value in source_tfidf.tf_idf.items(): similarity_scores[(doc, src_doc)] = ( similarity_scores.get((doc, src_doc), 0) + src_value * value) if do_length_normalization: - for doc_pair, value in similarity_scores.iteritems(): + for doc_pair, value in similarity_scores.items(): doc, src_doc = doc_pair - similarity_scores[(doc, src_doc)] = (value - / num_terms_per_doc[doc]) + similarity_scores[(doc, src_doc)] = value / num_terms_per_doc[doc] if logger.isEnabledFor(logging.DEBUG): - for doc, count in num_terms_per_doc.iteritems(): + for doc, count in num_terms_per_doc.items(): logger.debug( 'Seen {0} terms in query document {1}'.format(count, doc)) @@ -329,7 +328,7 @@ def write(self, tf_idf_file): """Writes TFIDF object to file.""" print ("", file=tf_idf_file) - for tup, value in self.tf_idf.iteritems(): + for tup, value in self.tf_idf.items(): term, doc = tup print("{order} {term} {doc} {tfidf}".format( order=len(term), term=" ".join(term), diff --git a/egs/wsj/s5/steps/conf/append_eval_to_ctm.py b/egs/wsj/s5/steps/conf/append_eval_to_ctm.py index f8e2aad891d..90679d2b341 100755 --- a/egs/wsj/s5/steps/conf/append_eval_to_ctm.py +++ b/egs/wsj/s5/steps/conf/append_eval_to_ctm.py @@ -3,6 +3,7 @@ # Copyright 2015 Brno University of Technology (author: Karel Vesely) # Apache 2.0 +from __future__ import print_function import sys,operator # Append Levenshtein alignment of 'hypothesis' and 'reference' into 'CTM': @@ -15,7 +16,7 @@ # 'U' = unknown (not part of scored segment) if len(sys.argv) != 4: - print 'Usage: %s eval-in ctm-in ctm-eval-out' % __file__ + print('Usage: %s eval-in ctm-in ctm-eval-out' % __file__) sys.exit(1) dummy, eval_in, ctm_in, ctm_eval_out = sys.argv @@ -54,7 +55,7 @@ # Build the 'ctm' with 'eval' column added, ctm_eval = [] -for utt,ctm_part in ctm.iteritems(): +for utt,ctm_part in ctm.items(): ctm_part.sort(key = operator.itemgetter(2)) # Sort by 'beg' time, try: # merging 'tuples' by '+', the record has format: @@ -69,7 +70,7 @@ # append, ctm_eval.extend(merged) except KeyError: - print 'Missing key', utt, 'in the word-evaluation stats from scoring' + print('Missing key', utt, 'in the word-evaluation stats from scoring') # Sort again, ctm_eval.sort(key = operator.itemgetter(0,1,2)) diff --git a/egs/wsj/s5/steps/conf/append_prf_to_ctm.py b/egs/wsj/s5/steps/conf/append_prf_to_ctm.py index 547b6176c9f..42acc5e22b7 100755 --- a/egs/wsj/s5/steps/conf/append_prf_to_ctm.py +++ b/egs/wsj/s5/steps/conf/append_prf_to_ctm.py @@ -3,6 +3,7 @@ # Copyright 2015 Brno University of Technology (author: Karel Vesely) # Apache 2.0 +from __future__ import print_function import sys # Append Levenshtein alignment of 'hypothesis' and 'reference' into 'CTM': @@ -16,7 +17,7 @@ # Parse options, if len(sys.argv) != 4: - print "Usage: %s prf ctm_in ctm_out" % __file__ + 
print("Usage: %s prf ctm_in ctm_out" % __file__) sys.exit(1) prf_file, ctm_file, ctm_out_file = sys.argv[1:] diff --git a/egs/wsj/s5/steps/conf/convert_ctm_to_tra.py b/egs/wsj/s5/steps/conf/convert_ctm_to_tra.py index 8fec0064fd7..25899e19264 100755 --- a/egs/wsj/s5/steps/conf/convert_ctm_to_tra.py +++ b/egs/wsj/s5/steps/conf/convert_ctm_to_tra.py @@ -3,6 +3,7 @@ # Copyright 2015 Brno University of Technology (author: Karel Vesely) # Apache 2.0 +from __future__ import print_function import sys, operator # This scripts loads a 'ctm' file and converts it into the 'tra' format: @@ -14,7 +15,7 @@ # - confidences if len(sys.argv) != 3: - print 'Usage: %s ctm-in tra-out' % __file__ + print('Usage: %s ctm-in tra-out' % __file__) sys.exit(1) dummy, ctm_in, tra_out = sys.argv @@ -31,7 +32,7 @@ # Store the in 'tra' format, with open(tra_out,'w') as f: - for utt,tuples in tra.iteritems(): + for utt,tuples in tra.items(): tuples.sort(key = operator.itemgetter(0)) # Sort by 'beg' time, f.write('%s %s\n' % (utt,' '.join([t[1] for t in tuples]))) diff --git a/egs/wsj/s5/steps/conf/parse_arpa_unigrams.py b/egs/wsj/s5/steps/conf/parse_arpa_unigrams.py index 1be32d4c4d7..f0a2fe13497 100755 --- a/egs/wsj/s5/steps/conf/parse_arpa_unigrams.py +++ b/egs/wsj/s5/steps/conf/parse_arpa_unigrams.py @@ -3,11 +3,12 @@ # Copyright 2015 Brno University of Technology (author: Karel Vesely) # Apache 2.0 +from __future__ import print_function import sys, gzip, re # Parse options, if len(sys.argv) != 4: - print "Usage: %s " % __file__ + print("Usage: %s " % __file__) sys.exit(0) words_txt, arpa_gz, unigrams_out = sys.argv[1:] @@ -31,7 +32,7 @@ # Create list, 'wrd id log_p_unigram', words_unigram = [[wrd, id, (wrd_log10[wrd] if wrd in wrd_log10 else -99)] for wrd,id in words ] -print >>sys.stderr, words_unigram[0] +print(words_unigram[0], file=sys.stderr) # Store, with open(unigrams_out,'w') as f: f.writelines(['%s %s %g\n' % (w,i,p) for (w,i,p) in words_unigram]) diff --git a/egs/wsj/s5/steps/conf/prepare_calibration_data.py b/egs/wsj/s5/steps/conf/prepare_calibration_data.py index bc8f92a2f7f..c4da720ba71 100755 --- a/egs/wsj/s5/steps/conf/prepare_calibration_data.py +++ b/egs/wsj/s5/steps/conf/prepare_calibration_data.py @@ -3,6 +3,7 @@ # Copyright 2015 Brno University of Technology (author: Karel Vesely) # Apache 2.0 +from __future__ import division import sys, math from optparse import OptionParser @@ -82,7 +83,7 @@ depths = dict() for l in open(o.lattice_depth): utt,d = l.split(' ',1) - depths[utt] = map(int,d.split()) + depths[utt] = [int(i) for i in d.split()] # Load the 'word_categories' mapping for categorical input features derived from 'lang/words.txt', wrd_to_cat = [ l.split() for l in open(word_categories_file) ] diff --git a/egs/wsj/s5/steps/data/augment_data_dir.py b/egs/wsj/s5/steps/data/augment_data_dir.py index 432b136e3b1..7edcdda2636 100755 --- a/egs/wsj/s5/steps/data/augment_data_dir.py +++ b/egs/wsj/s5/steps/data/augment_data_dir.py @@ -103,8 +103,8 @@ def AugmentWav(utt, wav, dur, fg_snr_opts, bg_snr_opts, fg_noise_utts, \ tot_noise_dur += noise_dur + interval noises.append(noise) - start_times_str = "--start-times='" + ",".join(list(map(str,start_times))) + "'" - snrs_str = "--snrs='" + ",".join(list(map(str,snrs))) + "'" + start_times_str = "--start-times='" + ",".join([str(i) for i in start_times]) + "'" + snrs_str = "--snrs='" + ",".join([str(i) for i in snrs]) + "'" noises_str = "--additive-signals='" + ",".join(noises).strip() + "'" # If the wav is just a file @@ -130,11 +130,11 @@ def 
CopyFileIfExists(utt_suffix, filename, input_dir, output_dir): def main(): args = GetArgs() - fg_snrs = list(map(int, args.fg_snr_str.split(":"))) - bg_snrs = list(map(int, args.bg_snr_str.split(":"))) + fg_snrs = [int(i) for i in args.fg_snr_str.split(":")] + bg_snrs = [int(i) for i in args.bg_snr_str.split(":")] input_dir = args.input_dir output_dir = args.output_dir - num_bg_noises = list(map(int, args.num_bg_noises.split(":"))) + num_bg_noises = [int(i) for i in args.num_bg_noises.split(":")] reco2dur = ParseFileToDict(input_dir + "/reco2dur", value_processor = lambda x: float(x[0])) wav_scp_file = open(input_dir + "/wav.scp", 'r').readlines() diff --git a/egs/wsj/s5/steps/data/reverberate_data_dir.py b/egs/wsj/s5/steps/data/reverberate_data_dir.py index 570613855a0..189f4619ddb 100755 --- a/egs/wsj/s5/steps/data/reverberate_data_dir.py +++ b/egs/wsj/s5/steps/data/reverberate_data_dir.py @@ -5,7 +5,6 @@ # script to generate reverberated data # we're using python 3.x style print but want it to work in python 2.x, -from __future__ import print_function import argparse, shlex, glob, math, os, random, sys, warnings, copy, imp, ast data_lib = imp.load_source('dml', 'steps/data/data_dir_manipulation_lib.py') @@ -121,17 +120,18 @@ def CheckArgs(args): return args -class list_cyclic_iterator: +class list_cyclic_iterator(object): def __init__(self, list): self.list_index = 0 self.list = list random.shuffle(self.list) - def next(self): + def __next__(self): item = self.list[self.list_index] self.list_index = (self.list_index + 1) % len(self.list) return item + next = __next__ # for Python 2 # This functions picks an item from the collection according to the associated probability distribution. # The probability estimate of each item in the collection is stored in the "probability" field of @@ -218,11 +218,11 @@ def AddPointSourceNoise(noise_addition_descriptor, # descriptor to store the in if noise.bg_fg_type == "background": noise_rvb_command = """wav-reverberate --impulse-response="{0}" --duration={1}""".format(noise_rir.rir_rspecifier, speech_dur) noise_addition_descriptor['start_times'].append(0) - noise_addition_descriptor['snrs'].append(background_snrs.next()) + noise_addition_descriptor['snrs'].append(next(background_snrs)) else: noise_rvb_command = """wav-reverberate --impulse-response="{0}" """.format(noise_rir.rir_rspecifier) noise_addition_descriptor['start_times'].append(round(random.random() * speech_dur, 2)) - noise_addition_descriptor['snrs'].append(foreground_snrs.next()) + noise_addition_descriptor['snrs'].append(next(foreground_snrs)) # check if the rspecifier is a pipe or not if len(noise.noise_rspecifier.split()) == 1: @@ -273,7 +273,7 @@ def GenerateReverberationOpts(room_dict, # the room dictionary, please refer to else: noise_addition_descriptor['noise_io'].append("{0} wav-reverberate --duration={1} - - |".format(isotropic_noise.noise_rspecifier, speech_dur)) noise_addition_descriptor['start_times'].append(0) - noise_addition_descriptor['snrs'].append(background_snrs.next()) + noise_addition_descriptor['snrs'].append(next(background_snrs)) noise_addition_descriptor = AddPointSourceNoise(noise_addition_descriptor, # descriptor to store the information of the noise added room, # the room selected diff --git a/egs/wsj/s5/steps/diagnostic/analyze_lattice_depth_stats.py b/egs/wsj/s5/steps/diagnostic/analyze_lattice_depth_stats.py index 56b9f69b3c9..6ed2bf78115 100755 --- a/egs/wsj/s5/steps/diagnostic/analyze_lattice_depth_stats.py +++ 
b/egs/wsj/s5/steps/diagnostic/analyze_lattice_depth_stats.py @@ -5,6 +5,7 @@ # Apache 2.0. from __future__ import print_function +from __future__ import division import argparse import sys, os from collections import defaultdict diff --git a/egs/wsj/s5/steps/dict/apply_lexicon_edits.py b/egs/wsj/s5/steps/dict/apply_lexicon_edits.py index a5bdbc30d46..f8568971fb7 100755 --- a/egs/wsj/s5/steps/dict/apply_lexicon_edits.py +++ b/egs/wsj/s5/steps/dict/apply_lexicon_edits.py @@ -10,7 +10,7 @@ def GetArgs(): parser = argparse.ArgumentParser(description = "Apply an lexicon edits file (output from steps/dict/select_prons_bayesian.py)to an input lexicon" "to produce a learned lexicon.", - epilog = "See steps/dict/learn_lexicon.sh for example") + epilog = "See steps/dict/learn_lexicon_greedy.sh for example") parser.add_argument("in_lexicon", metavar='', type = str, help = "Input lexicon. Each line must be .") diff --git a/egs/wsj/s5/steps/dict/get_pron_stats.py b/egs/wsj/s5/steps/dict/get_pron_stats.py index b5202a69abb..e8106bdd1ac 100755 --- a/egs/wsj/s5/steps/dict/get_pron_stats.py +++ b/egs/wsj/s5/steps/dict/get_pron_stats.py @@ -10,15 +10,16 @@ import sys def GetArgs(): - parser = argparse.ArgumentParser(description = "Accumulate statistics from lattice-alignment outputs for lexicon" - "learning. The inputs are a file containing arc level information from lattice-align-words," - "and a map which maps word-position-dependent phones to word-position-independent phones" - "(output from steps/cleanup/debug_lexicon.txt). The output contains accumulated soft-counts" - "of pronunciations", - epilog = "cat exp/tri3_lex_0.4_work/lats/arc_info_sym.*.txt \\|" - " steps/dict/get_pron_stats.py - exp/tri3_lex_0.4_work/phone_decode/phone_map.txt \\" - " exp/tri3_lex_0.4_work/lats/pron_stats.txt" - "See steps/dict/learn_lexicon.sh for examples in detail.") + parser = argparse.ArgumentParser( + description = "Accumulate statistics from lattice-alignment outputs for lexicon" + "learning. The inputs are a file containing arc level information from lattice-align-words," + "and a map which maps word-position-dependent phones to word-position-independent phones" + "(output from steps/cleanup/debug_lexicon.txt). 
The output contains accumulated soft-counts" + "of pronunciations", + epilog = "cat exp/tri3_lex_0.4_work/lats/arc_info_sym.*.txt \\|" + " steps/dict/get_pron_stats.py - exp/tri3_lex_0.4_work/phone_decode/phone_map.txt \\" + " exp/tri3_lex_0.4_work/lats/pron_stats.txt" + "See steps/dict/learn_lexicon_greedy.sh for examples in detail.") parser.add_argument("arc_info_file", metavar = "", type = str, help = "Input file containing per arc statistics; " @@ -75,14 +76,14 @@ def GetStatsFromArcInfo(arc_info_file_handle, phone_map_handle): prons[word].add(phones) stats_unmapped[(word, phones)] = stats_unmapped.get((word, phones), 0) + count - for word_pron, count in stats_unmapped.iteritems(): + for word_pron, count in stats_unmapped.items(): phones_unmapped = word_pron[1].split() phones = [phone_map[phone] for phone in phones_unmapped] stats[(word_pron[0], " ".join(phones))] = count return stats def WriteStats(stats, file_handle): - for word_pron, count in stats.iteritems(): + for word_pron, count in stats.items(): print('{2} {0} {1}'.format(word_pron[0], word_pron[1], count), file=file_handle) file_handle.close() diff --git a/egs/wsj/s5/steps/dict/internal/get_subsegments.py b/egs/wsj/s5/steps/dict/internal/get_subsegments.py new file mode 100755 index 00000000000..c431b4c7066 --- /dev/null +++ b/egs/wsj/s5/steps/dict/internal/get_subsegments.py @@ -0,0 +1,140 @@ +#!/usr/bin/env python + +# Copyright 2018 Xiaohui Zhang +# Apache 2.0. + +# we're using python 3.x style print but want it to work in python 2.x, +from __future__ import print_function +import argparse +import sys +import string + +def GetArgs(): + parser = argparse.ArgumentParser( + description = "The purpose of this script is to use a ctm and a vocab file" + "to extract sub-utterances and a sub-segmentation. Extracted sub-utterances" + "are all the strings of consecutive in-vocab words from the ctm" + "surrounded by an out-of-vocab word at each end if present.", + epilog = "e.g. steps/dict/internal/get_subsegments.py exp/tri3_lex_0.4_work/phonetic_decoding/word.ctm \\" + "exp/tri3_lex_0.4_work/learn_vocab.txt exp/tri3_lex_0.4_work/resegmentation/subsegments \\" + "exp/tri3_lex_0.4_work/resegmentation/text" + "See steps/dict/learn_lexicon_greedy.sh for an example.") + + parser.add_argument("ctm", metavar='', type = str, + help = "Input ctm file." + "each line must be ") + parser.add_argument("vocab", metavar='', type = str, + help = "Vocab file." + "each line must be ") + parser.add_argument("subsegment", metavar='', type = str, + help = "Subsegment file. Each line is in format:" + " ") + parser.add_argument("text", metavar='', type = str, + help = "Text file. Each line is in format:" + " ... 
.") + + print (' '.join(sys.argv), file = sys.stderr) + + args = parser.parse_args() + args = CheckArgs(args) + + return args + +def CheckArgs(args): + if args.ctm == "-": + args.ctm_handle = sys.stdin + else: + args.ctm_handle = open(args.ctm) + + if args.vocab is not '': + if args.vocab == "-": + args.vocab_handle = sys.stdout + else: + args.vocab_handle = open(args.vocab) + + args.subsegment_handle = open(args.subsegment, 'w') + args.text_handle = open(args.text, 'w') + + return args + +def GetSubsegments(args, vocab): + sub_utt = list() + last_is_oov = False + is_oov = False + utt_id_last = None + start_times = {} + end_times = {} + sub_utts = {} + sub_utt_id = 1 + sub_utt_id_last = 1 + end_time_last = 0.0 + for line in args.ctm_handle: + splits = line.strip().split() + if len(splits) < 5: + raise Exception("problematic line",line) + + utt_id = splits[0] + start = float(splits[2]) + dur = float(splits[3]) + word = splits[4] + if utt_id != utt_id_last: + sub_utt_id = 1 + if len(sub_utt)>1: + sub_utts[utt_id_last+'-'+str(sub_utt_id_last)] = (utt_id_last, sub_utt) + end_times[utt_id_last+'-'+str(sub_utt_id_last)] = ent_time_last + sub_utt = [] + start_times[utt_id+'-'+str(sub_utt_id)] = start + is_oov_last = False + if word == '': + is_oov = True + end_times[utt_id+'-'+str(sub_utt_id)] = start + dur + elif word in vocab: + is_oov = True + sub_utt.append(word) + end_times[utt_id+'-'+str(sub_utt_id)] = start + dur + else: + is_oov = False + if is_oov_last == True: + sub_utt.append(word) + sub_utts[utt_id+'-'+str(sub_utt_id_last)] = (utt_id, sub_utt) + end_times[utt_id+'-'+str(sub_utt_id_last)] = start + dur + sub_utt_id += 1 + sub_utt = [word] + start_times[utt_id+'-'+str(sub_utt_id)] = start + utt_id_last = utt_id + sub_utt_id_last = sub_utt_id + is_oov_last = is_oov + ent_time_last = start + dur + + if is_oov: + if word != '': + sub_utt.append(word) + sub_utts[utt_id+'-'+str(sub_utt_id_last)] = (utt_id, sub_utt) + end_times[utt_id+'-'+str(sub_utt_id_last)] = start + dur + + for utt,v in sorted(sub_utts.items()): + print(utt, ' '.join(sub_utts[utt][1]), file=args.text_handle) + print(utt, sub_utts[utt][0], start_times[utt], end_times[utt], file=args.subsegment_handle) + +def ReadVocab(vocab_file_handle): + vocab = set() + if vocab_file_handle: + for line in vocab_file_handle.readlines(): + splits = line.strip().split() + if len(splits) == 0: + continue + if len(splits) > 1: + raise Exception('Invalid format of line ' + line + + ' in vocab file.') + word = splits[0] + vocab.add(word) + return vocab + +def Main(): + args = GetArgs() + + vocab = ReadVocab(args.vocab_handle) + GetSubsegments(args, vocab) + +if __name__ == "__main__": + Main() diff --git a/egs/wsj/s5/steps/dict/internal/prune_pron_candidates.py b/egs/wsj/s5/steps/dict/internal/prune_pron_candidates.py index 1f2863424f3..60c7f75bbe8 100755 --- a/egs/wsj/s5/steps/dict/internal/prune_pron_candidates.py +++ b/egs/wsj/s5/steps/dict/internal/prune_pron_candidates.py @@ -1,6 +1,6 @@ #!/usr/bin/env python -# Copyright 2016 Xiaohui Zhang +# Copyright 2018 Xiaohui Zhang # Apache 2.0. from __future__ import print_function @@ -10,27 +10,36 @@ import math def GetArgs(): - parser = argparse.ArgumentParser(description = "Prune pronunciation candidates based on soft-counts from lattice-alignment" - "outputs, and a reference lexicon. 
Basically, for each word we sort all pronunciation" - "cadidates according to their soft-counts, and then select the top r * N candidates" - "(For words in the reference lexicon, N = # pron variants given by the reference" - "lexicon; For oov words, N = avg. # pron variants per word in the reference lexicon)." - "r is a user-specified constant, like 2.", - epilog = "See steps/dict/learn_lexicon.sh for example") - - parser.add_argument("--r", type = float, default = "2.0", - help = "a user-specified ratio parameter which determines how many" - "pronunciation candidates we want to keep for each word.") + parser = argparse.ArgumentParser( + description = "Prune pronunciation candidates based on soft-counts from lattice-alignment" + "outputs, and a reference lexicon. Basically, for each word we sort all pronunciation" + "cadidates according to their soft-counts, and then select the top variant-counts-ratio * N candidates" + "(For words in the reference lexicon, N = # pron variants given by the reference" + "lexicon; For oov words, N = avg. # pron variants per word in the reference lexicon).", + epilog = "See steps/dict/learn_lexicon_greedy.sh for example") + + parser.add_argument("--variant-counts-ratio", type = float, default = "3.0", + help = "A user-specified ratio parameter which determines how many" + "pronunciation candidates we want to keep for each word at most.") parser.add_argument("pron_stats", metavar = "", type = str, - help = "File containing soft-counts of all pronounciation candidates; " + help = "File containing soft-counts of pronounciation candidates; " "each line must be ") + parser.add_argument("lexicon_phonetic_decoding", metavar = "", type = str, + help = "Lexicon containing pronunciation candidates from phonetic decoding." + "each line must be ") + parser.add_argument("lexiconp_g2p", metavar = "", type = str, + help = "Lexicon with probabilities for pronunciation candidates from G2P." + "each line must be ") parser.add_argument("ref_lexicon", metavar = "", type = str, help = "Reference lexicon file, where we obtain # pron variants for" "each word, based on which we prune the pron candidates." "Each line must be ") - parser.add_argument("pruned_prons", metavar = "", type = str, - help = "An output file in lexicon format, which contains prons we want to" - "prune off from the pron_stats file.") + parser.add_argument("lexicon_phonetic_decoding_pruned", metavar = "", type = str, + help = "Output lexicon containing pronunciation candidates from phonetic decoding after pruning." + "each line must be ") + parser.add_argument("lexicon_g2p_pruned", metavar = "", type = str, + help = "Output lexicon containing pronunciation candidates from G2P after pruning." 
+ "each line must be ") print (' '.join(sys.argv), file=sys.stderr) @@ -40,12 +49,13 @@ def GetArgs(): return args def CheckArgs(args): + print(args) args.pron_stats_handle = open(args.pron_stats) + args.lexicon_phonetic_decoding_handle = open(args.lexicon_phonetic_decoding) + args.lexiconp_g2p_handle = open(args.lexiconp_g2p) args.ref_lexicon_handle = open(args.ref_lexicon) - if args.pruned_prons == "-": - args.pruned_prons_handle = sys.stdout - else: - args.pruned_prons_handle = open(args.pruned_prons, "w") + args.lexicon_phonetic_decoding_pruned_handle = open(args.lexicon_phonetic_decoding_pruned, "w") + args.lexicon_g2p_pruned_handle = open(args.lexicon_g2p_pruned, "w") return args def ReadStats(pron_stats_handle): @@ -62,13 +72,11 @@ def ReadStats(pron_stats_handle): phones = ' '.join(splits[2:]) stats[word].append((phones, count)) - for word, entry in stats.iteritems(): - entry.sort(key=lambda x: x[1]) return stats -def ReadLexicon(ref_lexicon_handle): - ref_lexicon = defaultdict(set) - for line in ref_lexicon_handle.readlines(): +def ReadLexicon(lexicon_handle): + lexicon = defaultdict(set) + for line in lexicon_handle.readlines(): splits = line.strip().split() if len(splits) == 0: continue @@ -77,42 +85,74 @@ def ReadLexicon(ref_lexicon_handle): + ' in lexicon file.') word = splits[0] phones = ' '.join(splits[1:]) - ref_lexicon[word].add(phones) - return ref_lexicon + lexicon[word].add(phones) + return lexicon -def PruneProns(args, stats, ref_lexicon): +def ReadLexiconp(lexiconp_handle): + lexicon = defaultdict(set) + pron_probs = defaultdict(float) + for line in lexiconp_handle.readlines(): + splits = line.strip().split() + if len(splits) == 0: + continue + if len(splits) < 3: + raise Exception('Invalid format of line ' + line + + ' in lexicon file.') + word = splits[1] + prob = float(splits[0]) + phones = ' '.join(splits[2:]) + pron_probs[(word, phones)] = prob + lexicon[word].add(phones) + return lexicon, pron_probs + +def PruneProns(args, stats, ref_lexicon, lexicon_phonetic_decoding, lexicon_g2p, lexicon_g2p_probs): + # For those pron candidates from lexicon_phonetic_decoding/g2p which don't + # have stats, we append them to the "stats" dict, with a zero count. + for word, entry in stats.iteritems(): + prons_with_stats = set() + for (pron, count) in entry: + prons_with_stats.add(pron) + for pron in lexicon_g2p[word]: + if pron not in prons_with_stats: + entry.append((pron, lexicon_g2p_probs[(word, pron)]-1.0)) + entry.sort(key=lambda x: x[1]) + # Compute the average # pron variants counts per word in the reference lexicon. 
num_words_ref = 0 num_prons_ref = 0 for word, prons in ref_lexicon.iteritems(): num_words_ref += 1 num_prons_ref += len(prons) - avg_variants_counts_ref = math.ceil(float(num_prons_ref) / float(num_words_ref)) - + avg_variant_counts_ref = round(float(num_prons_ref) / float(num_words_ref)) for word, entry in stats.iteritems(): if word in ref_lexicon: - variants_counts = args.r * len(ref_lexicon[word]) + variant_counts = args.variant_counts_ratio * len(ref_lexicon[word]) else: - variants_counts = args.r * avg_variants_counts_ref + variant_counts = args.variant_counts_ratio * avg_variant_counts_ref num_variants = 0 - while num_variants < variants_counts: + count = 0.0 + while num_variants < variant_counts: try: - pron, prob = entry.pop() - if word not in ref_lexicon or pron not in ref_lexicon[word]: + pron, count = entry.pop() + if word in ref_lexicon and pron in ref_lexicon[word]: + continue + if pron in lexicon_phonetic_decoding[word]: + num_variants += 1 + print('{0} {1}'.format(word, pron), file=args.lexicon_phonetic_decoding_pruned_handle) + if pron in lexicon_g2p[word]: num_variants += 1 + print('{0} {1}'.format(word, pron), file=args.lexicon_g2p_pruned_handle) except IndexError: break - - for word, entry in stats.iteritems(): - for pron, prob in entry: - if word not in ref_lexicon or pron not in ref_lexicon[word]: - print('{0} {1}'.format(word, pron), file=args.pruned_prons_handle) def Main(): args = GetArgs() ref_lexicon = ReadLexicon(args.ref_lexicon_handle) + lexicon_phonetic_decoding = ReadLexicon(args.lexicon_phonetic_decoding_handle) + lexicon_g2p, lexicon_g2p_probs = ReadLexiconp(args.lexiconp_g2p_handle) stats = ReadStats(args.pron_stats_handle) - PruneProns(args, stats, ref_lexicon) + + PruneProns(args, stats, ref_lexicon, lexicon_phonetic_decoding, lexicon_g2p, lexicon_g2p_probs) if __name__ == "__main__": Main() diff --git a/egs/wsj/s5/steps/dict/internal/sum_arc_info.py b/egs/wsj/s5/steps/dict/internal/sum_arc_info.py new file mode 100755 index 00000000000..5f02bc5fc29 --- /dev/null +++ b/egs/wsj/s5/steps/dict/internal/sum_arc_info.py @@ -0,0 +1,136 @@ +#!/usr/bin/env python + +# Copyright 2018 Xiaohui Zhang +# Apache 2.0 + +from __future__ import print_function +from collections import defaultdict +import argparse +import sys + +class StrToBoolAction(argparse.Action): + """ A custom action to convert bools from shell format i.e., true/false + to python format i.e., True/False """ + def __call__(self, parser, namespace, values, option_string=None): + if values == "true": + setattr(namespace, self.dest, True) + elif values == "false": + setattr(namespace, self.dest, False) + else: + raise Exception("Unknown value {0} for --{1}".format(values, self.dest)) + + +def GetArgs(): + parser = argparse.ArgumentParser( + description = "Accumulate statistics from per arc lattice statitics" + "for lexicon learning", + epilog = "See steps/dict/learn_lexicon_greedy.sh for example") + + parser.add_argument("--set-sum-to-one", type = str, default = True, + action = StrToBoolAction, choices = ["true", "false"], + help = "If normalize posteriors such that the sum of " + "pronunciation posteriors of a word in an utterance is 1.") + parser.add_argument("arc_info_file", metavar = "", type = str, + help = "File containing per arc statistics; " + "each line must be " + "") + parser.add_argument("phone_map", metavar = "", type = str, + help = "An input phone map used to remove word boundary markers from phones;" + "generated in steps/cleanup/debug_lexicon.sh") + parser.add_argument("stats_file", 
metavar = "", type = str, + help = "Write accumulated statitistics to this file" + "each line is " + "") + + print (' '.join(sys.argv), file=sys.stderr) + + args = parser.parse_args() + args = CheckArgs(args) + + return args + +def CheckArgs(args): + if args.arc_info_file == "-": + args.arc_info_file_handle = sys.stdin + else: + args.arc_info_file_handle = open(args.arc_info_file) + + args.phone_map_handle = open(args.phone_map) + + if args.stats_file == "-": + args.stats_file_handle = sys.stdout + else: + args.stats_file_handle = open(args.stats_file, "w") + + return args + +def Main(): + args = GetArgs() + + lexicon = defaultdict(list) + prons = defaultdict(list) + start_frames = {} + stats = defaultdict(lambda : defaultdict(float)) + sum_tot = defaultdict(float) + + phone_map = {} + for line in args.phone_map_handle.readlines(): + splits = line.strip().split() + phone_map[splits[0]] = splits[1] + + for line in args.arc_info_file_handle.readlines(): + splits = line.strip().split() + + if (len(splits) == 0): + continue + + if (len(splits) < 6): + raise Exception('Invalid format of line ' + line + + ' in ' + args.arc_info_file) + + utt = splits[0] + start_frame = int(splits[1]) + word = splits[4] + count = float(splits[3]) + phones_unmapped = splits[5:] + phones = [phone_map[phone] for phone in phones_unmapped] + phones = ' '.join(phones) + overlap = False + if word == '': + continue + if (word, utt) not in start_frames: + start_frames[(word, utt)] = start_frame + + if (word, utt) in stats: + stats[word, utt][phones] = stats[word, utt].get(phones, 0) + count + else: + stats[(word, utt)][phones] = count + sum_tot[(word, utt)] += count + + if phones not in prons[word]: + prons[word].append(phones) + + for (word, utt) in stats: + count_sum = 0.0 + counts = dict() + for phones in stats[(word, utt)]: + count = stats[(word, utt)][phones] + count_sum += count + counts[phones] = count + # By default we normalize the pron posteriors of each word in each utterance, + # so that they sum up exactly to one. If a word occurs two times in a utterance, + # the effect of this operation is to average the posteriors of these two occurences + # so that there's only one "equivalent occurence" of this word in the utterance. + # However, this case should be extremely rare if the utterances are already + # short sub-utterances produced by steps/dict/internal/get_subsegments.py + for phones in stats[(word, utt)]: + count = counts[phones] / count_sum + print(word, utt, start_frames[(word, utt)], count, phones, file=args.stats_file_handle) + # # Diagnostics info implying incomplete arc_info or multiple occurences of a word in a utterance: + # if count_sum < 0.9 or count_sum > 1.1: + # print(word, utt, start_frame, count_sum, stats[word, utt], file=sys.stderr) + + args.stats_file_handle.close() + +if __name__ == "__main__": + Main() diff --git a/egs/wsj/s5/steps/dict/learn_lexicon.sh b/egs/wsj/s5/steps/dict/learn_lexicon_bayesian.sh similarity index 93% rename from egs/wsj/s5/steps/dict/learn_lexicon.sh rename to egs/wsj/s5/steps/dict/learn_lexicon_bayesian.sh index a719422b593..042f8f94da4 100755 --- a/egs/wsj/s5/steps/dict/learn_lexicon.sh +++ b/egs/wsj/s5/steps/dict/learn_lexicon_bayesian.sh @@ -36,6 +36,7 @@ oov_symbol= lexicon_g2p= min_prob=0.3 +variant_counts_ratio=8 variants_prob_mass=0.7 variants_prob_mass_ref=0.9 @@ -93,6 +94,10 @@ if [ $# -lt 6 ] || [ $# -gt 7 ]; then echo " --min-prob # The cut-off parameter used to select pronunciation candidates from phonetic" echo " # decoding. 
We remove pronunciations with probabilities less than this value" echo " # after normalizing the probs s.t. the max-prob is 1.0 for each word." + echo " --variant-counts-ratio # This ratio parameter determines the maximum number of pronunciation" + echo " # candidates we will keep for each word, after pruning according to lattice statistics from" + echo " # the first iteration of lattice generation. See steps/dict/internal/prune_pron_candidates.py" + echo " # for details." echo " --prior-mean # Mean of priors (summing up to 1) assigned to three exclusive pronunciation" echo " # source: reference lexicon, g2p, and phonetic decoding (used in the Bayesian" echo " # pronunciation selection procedure). We recommend setting a larger prior" @@ -150,17 +155,17 @@ if [ $stage -le 0 ]; then # Remove non-scored-words from the reference lexicon. awk 'NR==FNR{a[$1] = 1; next} {if(!($1 in a)) print $0}' $dir/non_scored_words \ - $ref_dict/lexicon.txt | tr -s '\t' ' ' > $dir/ref_lexicon.txt + $ref_dict/lexicon.txt | tr -s '\t' ' ' | awk '$1=$1' > $dir/ref_lexicon.txt cat $dir/ref_lexicon.txt | awk '{print $1}' | sort | uniq > $dir/ref_vocab.txt awk 'NR==FNR{a[$1] = 1; next} {if(!($1 in a)) print $0}' $dir/non_scored_words \ $target_vocab | sort | uniq > $dir/target_vocab.txt # From the reference lexicon, we estimate the target_num_prons_per_word as, - # ceiling(avg. # prons per word in the reference lexicon). This'll be used as + # round(avg. # prons per word in the reference lexicon). This'll be used as # the upper bound of # pron variants per word when we apply G2P or select prons to # construct the learned lexicon in later stages. - python -c 'import sys; import math; print int(math.ceil(float(sys.argv[1])/float(sys.argv[2])))' \ + python -c 'import sys; import math; print int(round(float(sys.argv[1])/float(sys.argv[2])))' \ `wc -l $dir/ref_lexicon.txt | awk '{print $1}'` `wc -l $dir/ref_vocab.txt | awk '{print $1}'` \ > $dir/target_num_prons_per_word || exit 1; @@ -225,10 +230,11 @@ if [ $stage -le 2 ]; then # Get the oov words list (w.r.t ref vocab) which are in training data. awk 'NR==FNR{a[$1] = 1; next} {if(!($1 in a)) print $1}' $dir/ref_lexicon.txt \ - $dir/train_counts.txt | sort > $dir/oov_train.txt + $dir/train_counts.txt | awk 'NR==FNR{a[$1] = 1; next} {if(!($1 in a)) print $0}' \ + $dir/non_scored_words - | sort > $dir/oov_train.txt || exit 1; awk 'NR==FNR{a[$1] = 1; next} {if(($1 in a)) b+=$2; else c+=$2} END{print c/(b+c)}' \ - $dir/ref_vocab.txt $dir/train_counts.txt > $dir/train_oov_rate + $dir/ref_vocab.txt $dir/train_counts.txt > $dir/train_oov_rate || exit 1; echo "OOV rate (w.r.t. the reference lexicon) of the acoustic training data is:" cat $dir/train_oov_rate @@ -237,14 +243,14 @@ if [ $stage -le 2 ]; then # cannot be found in lexicon_g2p, we simply assign oov_symbol's pronunciaiton # (like NSN) to them, in order to get phonetic decoding pron candidates for them later on. awk 'NR==FNR{a[$1] = 1; next} ($1 in a)' $dir/oov_train.txt \ - $dir/lexicon_g2p.txt > $dir/g2p_prons_for_oov_train.txt + $dir/lexicon_g2p.txt > $dir/g2p_prons_for_oov_train.txt || exit 1; # Get the pronunciation of oov_symbol. - oov_pron=`cat $dir/non_scored_entries | grep $oov_symbol | cut -f2- -d' '` + oov_pron=`cat $dir/non_scored_entries | grep $oov_symbol | awk '{print $2}'` # For oov words in training data for which we don't even have G2P pron candidates, # we simply assign them the pronunciation of the oov symbol (like ). 
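The one-liners in this stage compute two quantities that are easy to miss in awk form: the target number of prons per word and the training-data OOV rate. A rough Python equivalent on toy data, shown here only as a reading aid (variable names are illustrative):

ref_lexicon_lines = ['the DH AH', 'the DH IY', 'cat K AE T']   # toy ref_lexicon.txt
train_counts = {'the': 100, 'cat': 7, 'kaldi': 3}              # toy train_counts.txt

ref_vocab = set(line.split()[0] for line in ref_lexicon_lines)
# target_num_prons_per_word = round(avg. number of prons per word in the ref lexicon)
target_num_prons_per_word = int(round(float(len(ref_lexicon_lines)) / len(ref_vocab)))

# train_oov_rate = count mass of words outside the reference vocab / total count mass
oov_mass = sum(c for w, c in train_counts.items() if w not in ref_vocab)
train_oov_rate = oov_mass / float(sum(train_counts.values()))
print(target_num_prons_per_word, train_oov_rate)   # 2 0.0272727...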
awk 'NR==FNR{a[$1] = 1; next} {if(!($1 in a)) print $1}' $dir/g2p_prons_for_oov_train.txt \ - $dir/oov_train.txt | awk -v op=$oov_pron '{print $0" "op}' > $dir/oov_train_no_pron.txt + $dir/oov_train.txt | awk -v op="$oov_pron" '{print $0" "op}' > $dir/oov_train_no_pron.txt || exit 1; cat $dir/oov_train_no_pron.txt $dir/g2p_prons_for_oov_train.txt $dir/ref_lexicon.txt | \ awk 'NR==FNR{a[$1] = 1; next} ($1 in a)' $dir/train_counts.txt - | \ @@ -263,7 +269,7 @@ if [ $stage -le 3 ]; then # We prune the phonetic decoding generated prons relative to the largest count, by setting "min_prob", # and only leave prons who are not present in the reference lexicon / g2p-generated lexicon. - cat $dir/ref_lexicon.txt $dir/lexicon_g2p.txt > $dir/phonetic_decoding/filter_lexicon.txt + cat $dir/ref_lexicon.txt $dir/lexicon_g2p.txt | sort -u > $dir/phonetic_decoding/filter_lexicon.txt $cmd $dir/phonetic_decoding/log/prons_to_lexicon.log steps/dict/prons_to_lexicon.py \ --min-prob=$min_prob --filter-lexicon=$dir/phonetic_decoding/filter_lexicon.txt \ @@ -295,7 +301,7 @@ if [ $stage -le 4 ]; then # Generate lattices for the acoustic training data with the combined lexicon. if $retrain_src_mdl; then mdl_dir=$dir/${src_mdl_dir}_retrained; else mdl_dir=$src_mdl_dir; fi - steps/align_fmllr_lats.sh --cmd "$decode_cmd" --nj $nj \ + steps/align_fmllr_lats.sh --acoustic-scale 0.05 --cmd "$decode_cmd" --nj $nj \ $data $dir/lang_combined_iter1 $mdl_dir $dir/lats_iter1 || exit 1; # Get arc level information from the lattice. @@ -321,13 +327,10 @@ if [ $stage -le 5 ]; then rm $dir/dict_combined_iter2/lexiconp.txt $dir/dict_combined_iter2/lexicon.txt 2>/dev/null # Prune away pronunciations which have low acoustic evidence from the first pass of lattice alignment. - $cmd $dir/lats_iter1/log/prune_pron_candidates.log steps/dict/internal/prune_pron_candidates.py $dir/lats_iter1/pron_stats.txt $dir/ref_lexicon.txt $dir/pruned_prons.txt - - awk 'NR==FNR{a[$0] = 1; next} (!($0 in a))' $dir/pruned_prons.txt $dir/lexicon_phonetic_decoding.txt \ - > $dir/lexicon_phonetic_decoding_pruned.txt - - awk 'NR==FNR{a[$0] = 1; next} (!($0 in a))' $dir/pruned_prons.txt $dir/lexicon_g2p.txt \ - > $dir/lexicon_g2p_pruned.txt \ + $cmd $dir/lats_iter1/log/prune_pron_candidates.log steps/dict/internal/prune_pron_candidates.py \ + --variant-counts-ratio $variant_counts_ratio \ + $dir/lats_iter1/pron_stats.txt $dir/lexicon_phonetic_decoding_pruned.txt $dir/lexiconp_g2p.txt $dir/ref_lexicon.txt \ + $dir/lexicon_phonetic_decoding_pruned.txt $dir/lexicon_g2p_pruned.txt # Filter out words which don't appear in the acoustic training data cat $dir/lexicon_phonetic_decoding_pruned.txt $dir/lexicon_g2p_pruned.txt \ @@ -402,7 +405,7 @@ if [ $stage -le 7 ]; then # target vocab. We'll just assign to them pronunciations from lexicon_g2p, if any. cat $dir/lats_iter2/out_of_ref_vocab_prons_learned.txt $dir/ref_lexicon.txt | \ awk 'NR==FNR{a[$1] = 1; next} !($1 in a)' - \ - $dir/target_vocab.txt | sort | uniq > $dir/oov_no_acoustics.txt + $dir/target_vocab.txt | sort | uniq > $dir/oov_no_acoustics.txt || exit 1; awk 'NR==FNR{a[$1] = 1; next} ($1 in a)' $dir/oov_no_acoustics.txt \ $dir/lexicon_g2p.txt > $dir/g2p_prons_for_oov_no_acoustics.txt @@ -426,5 +429,5 @@ if [ $stage -le 8 ]; then echo " ... sort -u \> $dest_dict/lexicon.txt to re-produce the final learned lexicon." 
cp $dir/lats_iter2/ref_lexicon_edits.txt $dest_dict/lexicon_edits.txt 2>/dev/null steps/dict/apply_lexicon_edits.py $dest_dict/lexicon0.txt $dir/lats_iter2/ref_lexicon_edits.txt - | \ - sort | uniq > $dest_dict/lexicon.txt + sort | uniq > $dest_dict/lexicon.txt || exit 1; fi diff --git a/egs/wsj/s5/steps/dict/learn_lexicon_greedy.sh b/egs/wsj/s5/steps/dict/learn_lexicon_greedy.sh new file mode 100755 index 00000000000..56e85f20d62 --- /dev/null +++ b/egs/wsj/s5/steps/dict/learn_lexicon_greedy.sh @@ -0,0 +1,546 @@ +#! /bin/bash + +# Copyright 2018 Xiaohui Zhang +# Apache 2.0 + +# This recipe has inputs and outputs similar to steps/dict/learn_lexicon_bayesian.sh. +# The major difference is, instead of using a Bayesian framework for +# pronunciation selection, we use a likelihood-reduction based greedy +# pronunciation selection framework presented in the paper: +# "Acoustic data-driven lexicon learning based on a greedy pronunciation " +# "selection framework, by X. Zhang, V. Manohar, D. Povey and S. Khudanpur," +# "Interspeech 2017." + +# This script demonstrates how to expand an existing lexicon using a combination +# of acoustic evidence and G2P to learn a lexicon that covers words in a target +# vocab, and agrees sufficiently with the acoustics. The basic idea is to +# run phonetic decoding on acoustic training data using an existing +# acoustic model (possibly re-trained using a G2P-expanded lexicon) to get +# alternative pronunciations for words in training data. Then we combine three +# exclusive sources of pronunciations: the reference lexicon (supposedly +# hand-derived), phonetic decoding, and G2P (optional) into one lexicon and then run +# lattice alignment on the same data, to collect acoustic evidence (soft +# counts) of all pronunciations. Based on these statistics, we use a greedy +# framework (see steps/dict/select_prons_greedy.py for details) to select an +# informative subset of pronunciations for each word with acoustic evidence. +# Two important parameters are alpha and beta. Basically, the three dimensions of alpha +# and beta correspond to three pronunciation sources: phonetic-decoding, G2P and +# the reference lexicon, and the larger a value is, the more aggressively we'll +# prune pronunciations from that source. The valid range of each dimension is [0, 1] +# for alpha (0 means we never prune prons from that source) and [0, 100] for beta. +# The output of steps/dict/select_prons_greedy.py is a learned lexicon whose vocab +# matches the user-specified target-vocab, and two intermediate outputs which were +# used to generate the learned lexicon: an edits file which records the recommended +# changes to all in-ref-vocab words' prons, and a half-learned lexicon +# ($dest_dict/lexicon0.txt) where all in-ref-vocab words' prons were untouched +# (on top of which we apply the edits file to produce the final learned lexicon). +# The user can always modify the edits file manually and then re-apply it on the +# half-learned lexicon using steps/dict/apply_lexicon_edits.py to produce the +# final learned lexicon. See the last stage in this script for details. + +stage=0 +# Begin configuration section. +cmd=run.pl +nj= +stage=0 +oov_symbol= +lexiconp_g2p= +min_prob=0.3 +variant_counts_ratio=8 +variant_counts_no_acoustics=1 +alpha="0,0,0" +beta="0,0,0" +delta=0.0000001 +num_gauss= +num_leaves= +retrain_src_mdl=true +cleanup=true +nj_select_prons=200 +learn_iv_prons=false # whether we want to learn the prons of IV words (w.r.t. ref_vocab), + +# End configuration section. + +. ./path.sh +. 
utils/parse_options.sh + +if [ $# -lt 6 ] || [ $# -gt 7 ]; then + echo "Usage: $0 [options] \\" + echo " ." + echo " This script does lexicon expansion using a combination of acoustic" + echo " evidence and G2P to produce a lexicon that covers words of a target vocab:" + echo "" + echo "Arguments:" + echo " The dir which contains the reference lexicon (most probably hand-derived)" + echo " we want to expand/improve, and nonsilence_phones.txt,.etc which we need " + echo " for building new dict dirs." + echo " The vocabulary we want the final learned lexicon to cover (one word per line)." + echo " acoustic training data we use to get alternative" + echo " pronunciations and collet acoustic evidence." + echo " The dir containing an SAT-GMM acoustic model (we optionaly we re-train it" + echo " using G2P expanded lexicon) to do phonetic decoding (to get alternative" + echo " pronunciations) and lattice-alignment (to collect acoustic evidence for" + echo " evaluating all prounciations)" + echo " The reference lang dir which we use to get non-scored-words" + echo " like for building new dict dirs" + echo " The dict dir where we put the final learned lexicon, whose vocab" + echo " matches ." + echo " The dir which contains all the intermediate outputs of this script." + echo "" + echo "Note: and the vocab of don't have to match. For words" + echo " who are in but not seen in , their pronunciations" + echo " will be given by G2P at the end." + echo "" + echo "e.g. $0 data/local/dict data/local/lm/librispeech-vocab.txt data/train \\" + echo " exp/tri3 data/lang data/local/dict_learned" + echo "Options:" + echo " --stage # stage to run from, to enable resuming from partially" + echo " # completed run (default: 0)" + echo " --cmd '$cmd' # command to submit jobs with (e.g. run.pl, queue.pl)" + echo " --nj # number of parallel jobs" + echo " --oov-symbol '$oov_symbol' # oov symbol, like ." + echo " --lexiconp-g2p # a lexicon (with prob in the second column) file containing g2p generated" + echo " # pronunciations, for words in acoustic training data / target vocabulary. It's optional." + echo " --min-prob # The cut-off parameter used to select pronunciation candidates from phonetic" + echo " # decoding. We remove pronunciations with probabilities less than this value" + echo " # after normalizing the probs s.t. the max-prob is 1.0 for each word." + echo " --variant-counts-ratio # This ratio parameter determines the maximum number of pronunciation" + echo " # candidates we will keep for each word, after pruning according to lattice statistics from" + echo " # the first iteration of lattice generation. See steps/dict/internal/prune_pron_candidates.py" + echo " # for details." + echo " --variant-counts-no-acoustics # how many g2p-prons per word we want to include for each words unseen in acoustic training data." + echo " --alpha ,, # scaling factors used in the greedy pronunciation selection framework, " + echo " # see steps/dict/select_prons_greedy.py for details." + echo " --beta ,, # smoothing factors used in the greedy pronunciation selection framework, " + echo " # see steps/dict/select_prons_greedy.py for details." + echo " --delta # a floor value used in the greedy pronunciation selection framework, " + echo " # see steps/dict/select_prons_greedy.py for details." + echo " --num-gauss # number of gaussians for the re-trained SAT model (on top of )." + echo " --num-leaves # number of leaves for the re-trained SAT model (on top of )." 
+ echo " --retrain-src-mdl # true if you want to re-train the src_mdl before phone decoding (default false)." + exit 1 +fi + +echo "$0 $@" # Print the command line for logging + +ref_dict=$1 +target_vocab=$2 +data=$3 +src_mdl_dir=$4 +ref_lang=$5 +dest_dict=$6 + +if [ -z "$oov_symbol" ]; then + echo "$0: the --oov-symbol option is required." + exit 1 +fi + +if [ $# -gt 6 ]; then + dir=$7 # Most intermediate outputs will be put here. +else + dir=${src_mdl_dir}_lex_learn_work +fi + +mkdir -p $dir +if [ $stage -le 0 ]; then + echo "$0: Some preparatory work." + # Get the word counts of training data. + awk '{for (n=2;n<=NF;n++) counts[$n]++;} END{for (w in counts) printf "%s %d\n",w, counts[w];}' \ + $data/text | sort > $dir/train_counts.txt + + # Get the non-scored entries and exclude them from the reference lexicon/vocab, and target_vocab. + steps/cleanup/internal/get_non_scored_words.py $ref_lang > $dir/non_scored_words + awk 'NR==FNR{a[$1] = 1; next} {if($1 in a) print $0}' $dir/non_scored_words \ + $ref_dict/lexicon.txt > $dir/non_scored_entries + + # Remove non-scored-words from the reference lexicon. + awk 'NR==FNR{a[$1] = 1; next} {if(!($1 in a)) print $0}' $dir/non_scored_words \ + $ref_dict/lexicon.txt | tr -s '\t' ' ' | awk '$1=$1' > $dir/ref_lexicon.txt + + cat $dir/ref_lexicon.txt | awk '{print $1}' | sort | uniq > $dir/ref_vocab.txt + awk 'NR==FNR{a[$1] = 1; next} {if(!($1 in a)) print $0}' $dir/non_scored_words \ + $target_vocab | sort | uniq > $dir/target_vocab.txt + + # From the reference lexicon, we estimate the target_num_prons_per_word as, + # round(avg. # prons per word in the reference lexicon). This'll be used as + # the upper bound of # pron variants per word when we apply G2P or select prons to + # construct the learned lexicon in later stages. + python -c 'import sys; import math; print int(round(float(sys.argv[1])/float(sys.argv[2])))' \ + `wc -l $dir/ref_lexicon.txt | awk '{print $1}'` `wc -l $dir/ref_vocab.txt | awk '{print $1}'` \ + > $dir/target_num_prons_per_word || exit 1; + + if [ -z $lexiconp_g2p ]; then + # create an empty list of g2p generated prons, if it's not given. + touch $dir/lexicon_g2p.txt + touch $dir/lexiconp_g2p.txt + else + # Exchange the 1st column (word) and 2nd column (prob) and remove pronunciations + # which are already in the reference lexicon. + cat $lexiconp_g2p | awk '{a=$1;b=$2; $1="";$2="";print b" "a$0}' | \ + awk 'NR==FNR{a[$0] = 1; next} {w=$2;for (n=3;n<=NF;n++) w=w" "$n; if(!(w in a)) print $0}' \ + $dir/ref_lexicon.txt - > $dir/lexiconp_g2p.txt 2>/dev/null + + # make a copy where we remove the first column (probabilities). + cat $dir/lexiconp_g2p.txt | cut -f1,3- > $dir/lexicon_g2p.txt 2>/dev/null + fi + variant_counts=`cat $dir/target_num_prons_per_word` || exit 1; + $cmd $dir/log/prune_g2p_lexicon.log steps/dict/prons_to_lexicon.py \ + --top-N=$variant_counts $dir/lexiconp_g2p.txt \ + $dir/lexicon_g2p_variant_counts${variant_counts}.txt || exit 1; +fi + +if [ $stage -le 1 ] && $retrain_src_mdl; then + echo "$0: Expand the reference lexicon to cover all words in the target vocab. and then" + echo " ... re-train the source acoustic model for phonetic decoding. " + mkdir -p $dir/dict_expanded_target_vocab + cp $ref_dict/{extra_questions.txt,optional_silence.txt,nonsilence_phones.txt,silence_phones.txt} \ + $dir/dict_expanded_target_vocab 2>/dev/null + rm $dir/dict_expanded_target_vocab/lexiconp.txt $dir/dict_expanded_target_vocab/lexicon.txt 2>/dev/null + + # Get the oov words list (w.r.t ref vocab) which are in the target vocab. 
+ awk 'NR==FNR{a[$1] = 1; next} !($1 in a)' $dir/ref_lexicon.txt \ + $dir/target_vocab.txt | sort | uniq > $dir/oov_target_vocab.txt + + # Assign pronunciations from lexicon_g2p.txt to oov_target_vocab. For words which + # cannot be found in lexicon_g2p.txt, we simply ignore them. + awk 'NR==FNR{a[$1] = 1; next} ($1 in a)' $dir/oov_target_vocab.txt \ + $dir/lexicon_g2p.txt > $dir/lexicon_g2p_oov_target_vocab.txt + + cat $dir/lexicon_g2p_oov_target_vocab.txt $dir/ref_lexicon.txt | \ + awk 'NR==FNR{a[$1] = 1; next} ($1 in a)' $dir/target_vocab.txt - | \ + cat $dir/non_scored_entries - | + sort | uniq > $dir/dict_expanded_target_vocab/lexicon.txt + + utils/prepare_lang.sh --phone-symbol-table $ref_lang/phones.txt $dir/dict_expanded_target_vocab \ + $oov_symbol $dir/lang_expanded_target_vocab_tmp $dir/lang_expanded_target_vocab || exit 1; + + # Align the acoustic training data using the given src_mdl_dir. + alidir=${src_mdl_dir}_ali_$(basename $data) + steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \ + $data $dir/lang_expanded_target_vocab $src_mdl_dir $alidir || exit 1; + + # Train another SAT system on the given data and put it in $dir/${src_mdl_dir}_retrained + # this model will be used for phonetic decoding and lattice alignment later on. + if [ -z $num_leaves ] || [ -z $num_gauss ] ; then + echo "num_leaves and num_gauss need to be specified." && exit 1; + fi + steps/train_sat.sh --cmd "$train_cmd" $num_leaves $num_gauss \ + $data $dir/lang_expanded_target_vocab $alidir $dir/${src_mdl_dir}_retrained || exit 1; +fi + +if [ $stage -le 2 ]; then + echo "$0: Expand the reference lexicon to cover all words seen in," + echo " ... acoustic training data, and prepare corresponding dict and lang directories." + echo " ... This is needed when generate pron candidates from phonetic decoding." + mkdir -p $dir/dict_expanded_train + cp $ref_dict/{extra_questions.txt,optional_silence.txt,nonsilence_phones.txt,silence_phones.txt} \ + $dir/dict_expanded_train 2>/dev/null + rm $dir/dict_expanded_train/lexiconp.txt $dir/dict_expanded_train/lexicon.txt 2>/dev/null + + # Get the oov words list (w.r.t ref vocab) which are in training data. + awk 'NR==FNR{a[$1] = 1; next} {if(!($1 in a)) print $1}' $dir/ref_lexicon.txt \ + $dir/train_counts.txt | awk 'NR==FNR{a[$1] = 1; next} {if(!($1 in a)) print $0}' \ + $dir/non_scored_words - | sort > $dir/oov_train.txt || exit 1; + + awk 'NR==FNR{a[$1] = 1; next} {if(($1 in a)) b+=$2; else c+=$2} END{print c/(b+c)}' \ + $dir/ref_vocab.txt $dir/train_counts.txt > $dir/train_oov_rate || exit 1; + + echo "OOV rate (w.r.t. the reference lexicon) of the acoustic training data is:" + cat $dir/train_oov_rate + + # Assign pronunciations from lexicon_g2p to oov_train. For words which + # cannot be found in lexicon_g2p, we simply assign oov_symbol's pronunciaiton + # (like NSN) to them, in order to get phonetic decoding pron candidates for them later on. + variant_counts=`cat $dir/target_num_prons_per_word` || exit 1; + awk 'NR==FNR{a[$1] = 1; next} ($1 in a)' $dir/oov_train.txt \ + $dir/lexicon_g2p_variant_counts${variant_counts}.txt > $dir/g2p_prons_for_oov_train.txt || exit 1; + + # Get the pronunciation of oov_symbol. + oov_pron=`cat $dir/non_scored_entries | grep $oov_symbol | awk '{print $2}'` + # For oov words in training data for which we don't even have G2P pron candidates, + # we simply assign them the pronunciation of the oov symbol (like ), + # so that we can get pronunciations for them from phonetic decoding. 
+ awk 'NR==FNR{a[$1] = 1; next} {if(!($1 in a)) print $1}' $dir/g2p_prons_for_oov_train.txt \ + $dir/oov_train.txt | awk -v op="$oov_pron" '{print $0" "op}' > $dir/oov_train_no_pron.txt || exit 1; + + cat $dir/oov_train_no_pron.txt $dir/g2p_prons_for_oov_train.txt $dir/ref_lexicon.txt | \ + awk 'NR==FNR{a[$1] = 1; next} ($1 in a)' $dir/train_counts.txt - | \ + cat - $dir/non_scored_entries | \ + sort | uniq > $dir/dict_expanded_train/lexicon.txt || exit 1; + + utils/prepare_lang.sh $dir/dict_expanded_train $oov_symbol \ + $dir/lang_expanded_train_tmp $dir/lang_expanded_train || exit 1; +fi + +if [ $stage -le 3 ]; then + echo "$0: Generate pronunciation candidates from phonetic decoding on acoustic training data.." + if $retrain_src_mdl; then mdl_dir=$dir/${src_mdl_dir}_retrained; else mdl_dir=$src_mdl_dir; fi + steps/cleanup/debug_lexicon.sh --nj $nj \ + --cmd "$decode_cmd" $data $dir/lang_expanded_train \ + $mdl_dir $dir/dict_expanded_train/lexicon.txt $dir/phonetic_decoding || exit 1; +fi + +if [ $stage -le 4 ]; then + echo "$0: Combine the reference lexicon and pronunciations from phone-decoding/G2P into one" + echo " ... lexicon, and run lattice alignment using this lexicon on acoustic training data" + echo " ... to collect acoustic evidence." + # We first prune the phonetic decoding generated prons relative to the largest count, by setting "min_prob", + # and only leave prons who are not present in the reference lexicon / g2p-generated lexicon. + cat $dir/ref_lexicon.txt $dir/lexicon_g2p.txt | sort -u > $dir/phonetic_decoding/filter_lexicon.txt + + $cmd $dir/phonetic_decoding/log/prons_to_lexicon.log steps/dict/prons_to_lexicon.py \ + --min-prob=$min_prob --filter-lexicon=$dir/phonetic_decoding/filter_lexicon.txt \ + $dir/phonetic_decoding/prons.txt $dir/lexicon_pd_with_eps.txt + + # We abandon phonetic-decoding candidates for infrequent words. + awk '{if($2 < 3) print $1}' $dir/train_counts.txt > $dir/pd_candidates_to_exclude.txt + awk 'NR==FNR{a[$1] = $2; next} {if(a[$1]<10) print $1}' $dir/train_counts.txt \ + $dir/oov_train_no_pron.txt >> $dir/pd_candidates_to_exclude.txt + + if [ -s $dir/pd_candidates_to_exclude.txt ]; then + cat $dir/lexicon_pd_with_eps.txt | grep -vP "|||\[.*\]" | \ + awk 'NR==FNR{a[$0] = 1; next} {if(!($1 in a)) print $0}' $dir/pd_candidates_to_exclude.txt - | \ + sort | uniq > $dir/lexicon_pd.txt || exit 1; + else + cat $dir/lexicon_pd_with_eps.txt | grep -vP "|||\[.*\]" | \ + sort | uniq > $dir/lexicon_pd.txt || exit 1; + fi + + # Combine the reference lexicon, pronunciations from G2P and phonetic decoding into one lexicon. + mkdir -p $dir/dict_combined_iter1 + cp $ref_dict/{extra_questions.txt,optional_silence.txt,nonsilence_phones.txt,silence_phones.txt} \ + $dir/dict_combined_iter1/ 2>/dev/null + rm $dir/dict_combined_iter1/lexiconp.txt $dir/dict_combined_iter1/lexicon.txt 2>/dev/null + + # Filter out words which don't appear in the acoustic training data + cat $dir/lexicon_pd.txt $dir/lexicon_g2p.txt \ + $dir/ref_lexicon.txt | tr -s '\t' ' ' | \ + awk 'NR==FNR{a[$1] = 1; next} ($1 in a)' $dir/train_counts.txt - | \ + cat $dir/non_scored_entries - | \ + sort | uniq > $dir/dict_combined_iter1/lexicon.txt + + utils/prepare_lang.sh --phone-symbol-table $ref_lang/phones.txt \ + $dir/dict_combined_iter1 $oov_symbol \ + $dir/lang_combined_iter1_tmp $dir/lang_combined_iter1 || exit 1; + + # Generate lattices for the acoustic training data with the combined lexicon. 
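A minimal sketch of the min_prob pruning applied at the start of this stage, assuming per-word soft counts are first rescaled so the best pronunciation gets 1.0 and anything below min_prob is then dropped; the function name and toy counts are illustrative, not part of the patch:

def prune_by_min_prob(counts, min_prob=0.3):
    # counts: pron -> soft count for a single word; scale so the best pron is 1.0,
    # then drop prons whose scaled value falls below min_prob.
    best = max(counts.values())
    return dict((p, c / best) for p, c in counts.items() if c / best >= min_prob)

print(prune_by_min_prob({'K AE T': 9.0, 'K AH T': 3.0, 'G AE T': 0.5}))
# keeps 'K AE T' (1.0) and 'K AH T' (0.33...); 'G AE T' (0.06) is pruned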
+ if $retrain_src_mdl; then mdl_dir=$dir/${src_mdl_dir}_retrained; else mdl_dir=$src_mdl_dir; fi + + # Get the vocab for words for which we want to learn pronunciations. + if $learn_iv_prons; then + # If we want to learn the prons of IV words (w.r.t. ref_vocab), the learn_vocab is just the intersection of + # target_vocab and the vocab of words seen in acoustic training data (first col. of train_counts.txt) + awk 'NR==FNR{a[$1] = 1; next} {if($1 in a) print $1}' $dir/target_vocab.txt $dir/train_counts.txt \ + > $dir/learn_vocab.txt + else + # Exclude words from the ref_vocab if we don't want to learn the pronunciations of IV words. + awk 'NR==FNR{a[$1] = 1; next} {if($1 in a) print $1}' $dir/target_vocab.txt $dir/train_counts.txt | \ + awk 'NR==FNR{a[$1] = 1; next} {if(!($1 in a)) print $1}' $dir/ref_vocab.txt - > $dir/learn_vocab.txt + fi + + # In order to get finer lattice stats of alternative prons, we want to make lattices deeper. + # To speed up lattice generation, we use a ctm to create sub-utterances and a sub-segmentation + # for each instance of a word within learn_vocab (or a string of consecutive words within learn_vocab), + # including a single out-of-learn-vocab word at the boundary if present. + mkdir -p $dir/resegmentation + steps/dict/internal/get_subsegments.py $dir/phonetic_decoding/word.ctm $dir/learn_vocab.txt \ + $dir/resegmentation/subsegments $dir/resegmentation/text || exit 1; + utils/data/subsegment_data_dir.sh $data $dir/resegmentation/subsegments $dir/resegmentation/text \ + $dir/resegmentation/data || exit 1; + steps/compute_cmvn_stats.sh $dir/resegmentation/data || exit 1; + + steps/align_fmllr_lats.sh --beam 20 --retry-beam 50 --final-beam 30 --acoustic-scale 0.05 --cmd "$decode_cmd" --nj $nj \ + $dir/resegmentation/data $dir/lang_combined_iter1 $mdl_dir $dir/lats_iter1 || exit 1; + + # Get arc level information from the lattice. + $cmd JOB=1:$nj $dir/lats_iter1/log/get_arc_info.JOB.log \ + lattice-align-words $dir/lang_combined_iter1/phones/word_boundary.int \ + $dir/lats_iter1/final.mdl \ + "ark:gunzip -c $dir/lats_iter1/lat.JOB.gz |" ark:- \| \ + lattice-arc-post --acoustic-scale=0.1 $dir/lats_iter1/final.mdl ark:- - \| \ + utils/int2sym.pl -f 5 $dir/lang_combined_iter1/words.txt \| \ + utils/int2sym.pl -f 6- $dir/lang_combined_iter1/phones.txt '>' \ + $dir/lats_iter1/arc_info_sym.JOB.txt || exit 1; + + # Compute soft counts (pron_stats) of every particular word-pronunciation pair by + # summing up arc level information over all utterances. We'll use this to prune + # pronunciation candidates before the next iteration of lattice generation. + cat $dir/lats_iter1/arc_info_sym.*.txt | steps/dict/get_pron_stats.py - \ + $dir/phonetic_decoding/phone_map.txt $dir/lats_iter1/pron_stats.txt || exit 1; + + # Accumlate utterance-level pronunciation posteriors (into arc_stats) by summing up + # posteriors of arcs representing the same word & pronunciation and starting + # from roughly the same location. See steps/dict/internal/sum_arc_info.py for details. + for i in `seq 1 $nj`;do + cat $dir/lats_iter1/arc_info_sym.${i}.txt | sort -n -k1 -k2 -k3r | \ + steps/dict/internal/sum_arc_info.py - $dir/phonetic_decoding/phone_map.txt $dir/lats_iter1/arc_info_summed.${i}.txt + done + cat $dir/lats_iter1/arc_info_summed.*.txt | sort -k1 -k2 > $dir/lats_iter1/arc_stats.txt + + # Prune the phonetic_decoding lexicon so that any pronunciation that only has non-zero posterior at one word example will be removed. + # The pruned lexicon is put in $dir/lats_iter1. 
After further pruning in the next stage it'll be put back to $dir. + awk 'NR==FNR{w=$1;for (n=5;n<=NF;n++) w=w" "$n;a[w]+=1;next} {if($0 in a && a[$0]>1) print $0}' \ + $dir/lats_iter1/arc_stats.txt $dir/lexicon_pd.txt > $dir/lats_iter1/lexicon_pd_pruned.txt +fi + +# Here we re-generate lattices (with a wider beam and a pruned combined lexicon) and re-collect pronunciation statistics +if [ $stage -le 5 ]; then + echo "$0: Prune the pronunciation candidates generated from G2P/phonetic decoding, and re-do lattice-alignment." + mkdir -p $dir/dict_combined_iter2 + cp $ref_dict/{extra_questions.txt,optional_silence.txt,nonsilence_phones.txt,silence_phones.txt} \ + $dir/dict_combined_iter2/ 2>/dev/null + rm $dir/dict_combined_iter2/lexiconp.txt $dir/dict_combined_iter2/lexicon.txt 2>/dev/null + + # Prune away pronunciations which have low acoustic evidence from the first pass of lattice generation. + $cmd $dir/lats_iter1/log/prune_pron_candidates.log steps/dict/internal/prune_pron_candidates.py \ + --variant-counts-ratio $variant_counts_ratio \ + $dir/lats_iter1/pron_stats.txt $dir/lats_iter1/lexicon_pd_pruned.txt $dir/lexiconp_g2p.txt $dir/ref_lexicon.txt \ + $dir/lexicon_pd_pruned.txt $dir/lexicon_g2p_pruned.txt + + # Filter out words which don't appear in the acoustic training data. + cat $dir/lexicon_pd_pruned.txt $dir/lexicon_g2p_pruned.txt \ + $dir/ref_lexicon.txt | tr -s '\t' ' ' | \ + awk 'NR==FNR{a[$1] = 1; next} ($1 in a)' $dir/train_counts.txt - | \ + cat $dir/non_scored_entries - | \ + sort | uniq > $dir/dict_combined_iter2/lexicon.txt + + utils/prepare_lang.sh --phone-symbol-table $ref_lang/phones.txt \ + $dir/dict_combined_iter2 $oov_symbol \ + $dir/lang_combined_iter2_tmp $dir/lang_combined_iter2 || exit 1; + + # Re-generate lattices with a wider beam, so that we'll get deeper lattices. + if $retrain_src_mdl; then mdl_dir=$dir/${src_mdl_dir}_retrained; else mdl_dir=$src_mdl_dir; fi + steps/align_fmllr_lats.sh --beam 30 --retry-beam 60 --final-beam 50 --acoustic-scale 0.05 --cmd "$decode_cmd" --nj $nj \ + $dir/resegmentation/data $dir/lang_combined_iter2 $mdl_dir $dir/lats_iter2 || exit 1; + + # Get arc level information from the lattice as we did in the last stage. + $cmd JOB=1:$nj $dir/lats_iter2/log/get_arc_info.JOB.log \ + lattice-align-words $dir/lang_combined_iter2/phones/word_boundary.int \ + $dir/lats_iter2/final.mdl \ + "ark:gunzip -c $dir/lats_iter2/lat.JOB.gz |" ark:- \| \ + lattice-arc-post --acoustic-scale=0.1 $dir/lats_iter2/final.mdl ark:- - \| \ + utils/int2sym.pl -f 5 $dir/lang_combined_iter2/words.txt \| \ + utils/int2sym.pl -f 6- $dir/lang_combined_iter2/phones.txt '>' \ + $dir/lats_iter2/arc_info_sym.JOB.txt || exit 1; + + # Compute soft counts (pron_stats) of every particular word-pronunciation pair as + # we did in the last stage. The stats will only be used as diagnostics. + cat $dir/lats_iter2/arc_info_sym.*.txt | steps/dict/get_pron_stats.py - \ + $dir/phonetic_decoding/phone_map.txt $dir/lats_iter2/pron_stats.txt || exit 1; + + # Accumlate utterance-level pronunciation posteriors as we did in the last stage. + for i in `seq 1 $nj`;do + cat $dir/lats_iter2/arc_info_sym.${i}.txt | sort -n -k1 -k2 -k3r | \ + steps/dict/internal/sum_arc_info.py - $dir/phonetic_decoding/phone_map.txt $dir/lats_iter2/arc_info_summed.${i}.txt + done + cat $dir/lats_iter2/arc_info_summed.*.txt | sort -k1 -k2 > $dir/lats_iter2/arc_stats.txt + + # The pron_stats are the acoustic evidence which the likelihood-reduction-based pronunciation + # selection procedure will be based on. 
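A toy illustration (not part of the patch) of what the arc-info summing step boils down to: posteriors of arcs carrying the same word and pronunciation within an utterance are summed, and then normalized per (word, utterance) so they sum to one:

from collections import defaultdict

# (word, utt, pron, arc posterior) tuples, as if read from the arc_info files
arcs = [('the', 'utt1', 'DH AH', 0.5), ('the', 'utt1', 'DH AH', 0.2),
        ('the', 'utt1', 'DH IY', 0.3)]
acc = defaultdict(float)
for word, utt, pron, post in arcs:
    acc[(word, utt, pron)] += post                 # sum posteriors per pron
totals = defaultdict(float)
for (word, utt, pron), c in acc.items():
    totals[(word, utt)] += c
posteriors = dict((k, c / totals[(k[0], k[1])]) for k, c in acc.items())
print(posteriors)   # 'DH AH' and 'DH IY' posteriors sum to 1 (roughly 0.7 and 0.3)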
+ # Split the utterance-level pronunciation posterior stats into $nj_select_prons pieces, + # so that the following pronunciation selection stage can be parallelized. + numsplit=$nj_select_prons + awk '{print $1"-"$2" "$1}' $dir/lats_iter2/arc_stats.txt > $dir/lats_iter2/utt2word + utt2words=$(for n in `seq $numsplit`; do echo $dir/lats_iter2/utt2word.$n; done) + utils/split_scp.pl --utt2spk=$dir/lats_iter2/utt2word $dir/lats_iter2/utt2word $utt2words || exit 1 + for n in `seq $numsplit`; do + (cat $dir/lats_iter2/utt2word.$n | awk '{$1=substr($1,length($2)+2);print $2" "$1}' - > $dir/lats_iter2/word2utt.$n + awk 'NR==FNR{a[$0] = 1; next} {b=$1" "$2; if(b in a) print $0}' $dir/lats_iter2/word2utt.$n \ + $dir/lats_iter2/arc_stats.txt > $dir/lats_iter2/arc_stats.${n}.txt + ) & + done + wait +fi + +if [ $stage -le 6 ]; then + echo "$0: Select pronunciations according to the acoustic evidence from lattice alignment." + # Given the acoustic evidence (soft-counts), we use a Bayesian framework to select pronunciations + # from three exclusive candidate sources: reference (hand-derived) lexicon, G2P and phonetic decoding. + # The posteriors for all candidate prons for all words are printed into pron_posteriors.txt + # For words which are out of the ref. vocab, the learned prons are written into out_of_ref_vocab_prons_learned.txt. + # Among them, for words without acoustic evidence, we just ignore them, even if pron candidates from G2P were provided). + # For words in the ref. vocab, we instead output a human readable & editable "edits" file called + # ref_lexicon_edits.txt, which records all proposed changes to the prons (if any). Also, a + # summary is printed into the log file. + + $cmd JOB=1:$nj_select_prons $dir/lats_iter2/log/generate_learned_lexicon.JOB.log \ + steps/dict/select_prons_greedy.py \ + --alpha=${alpha} --beta=${beta} \ + --delta=${delta} \ + $ref_dict/silence_phones.txt $dir/lats_iter2/arc_stats.JOB.txt $dir/train_counts.txt $dir/ref_lexicon.txt \ + $dir/lexicon_g2p_pruned.txt $dir/lexicon_pd_pruned.txt \ + $dir/lats_iter2/learned_lexicon.JOB.txt || exit 1; + + cat $dir/lats_iter2/learned_lexicon.*.txt > $dir/lats_iter2/learned_lexicon.txt + rm $dir/lats_iter2/learned_lexicon.*.txt + + $cmd $dir/lats_iter2/log/lexicon_learning_summary.log \ + steps/dict/merge_learned_lexicons.py \ + $dir/lats_iter2/arc_stats.txt $dir/train_counts.txt $dir/ref_lexicon.txt \ + $dir/lexicon_g2p_pruned.txt $dir/lexicon_pd_pruned.txt \ + $dir/lats_iter2/learned_lexicon.txt \ + $dir/lats_iter2/out_of_ref_vocab_prons_learned.txt $dir/lats_iter2/ref_lexicon_edits.txt || exit 1; + + cp $dir/lats_iter2/ref_lexicon_edits.txt $dir/lats_iter2/ref_lexicon_edits.txt + # Remove some stuff that takes up space and is unlikely to be useful later on. + if $cleanup; then + rm -r $dir/lats_iter*/{fsts*,lat*} 2>/dev/null + fi +fi + +if [ $stage -le 7 ]; then + echo "$0: Expand the learned lexicon further to cover words in target vocab that are." + echo " ... not seen in acoustic training data." + mkdir -p $dest_dict + cp $ref_dict/{extra_questions.txt,optional_silence.txt,nonsilence_phones.txt,silence_phones.txt} \ + $dest_dict 2>/dev/null + rm $dest_dict/lexiconp.txt $dest_dict/lexicon.txt 2>/dev/null + # Get the list of oov (w.r.t. ref vocab) without acoustic evidence, which are in the + # target vocab. We'll just assign to them pronunciations from lexicon_g2p, if any. 
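As a reading aid for the fallback applied below (the real script does this with awk): target-vocab words without acoustic evidence get their G2P prons when available, and otherwise the pronunciation of the oov symbol. All names and values here are toy examples:

lexicon_g2p = {'kaldi': ['K AA L D IY']}   # toy G2P lexicon
oov_pron = 'S P N'                         # toy pronunciation of the oov symbol
oov_no_acoustics = ['kaldi', 'xyzzy']
fallback = dict((w, lexicon_g2p.get(w, [oov_pron])) for w in oov_no_acoustics)
print(fallback)   # {'kaldi': ['K AA L D IY'], 'xyzzy': ['S P N']}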
+ cat $dir/lats_iter2/out_of_ref_vocab_prons_learned.txt $dir/ref_lexicon.txt | \ + awk 'NR==FNR{a[$1] = 1; next} !($1 in a)' - \ + $dir/target_vocab.txt | sort | uniq > $dir/oov_no_acoustics.txt || exit 1; + + variant_counts=$variant_counts_no_acoustics + + $cmd $dir/log/prune_g2p_lexicon.log steps/dict/prons_to_lexicon.py \ + --top-N=$variant_counts $dir/lexiconp_g2p.txt \ + $dir/lexicon_g2p_variant_counts${variant_counts}.txt || exit 1; + + awk 'NR==FNR{a[$1] = 1; next} ($1 in a)' $dir/oov_no_acoustics.txt \ + $dir/lexicon_g2p_variant_counts${variant_counts}.txt > $dir/g2p_prons_for_oov_no_acoustics.txt|| exit 1; + + # Get the pronunciation of oov_symbol. + oov_pron=`cat $dir/non_scored_entries | grep $oov_symbol | awk '{print $2}'` || exit 1; + # For oov words in target_vocab for which we don't even have G2P pron candidates, + # we simply assign them the pronunciation of the oov symbol (like ), + if [ -s $dir/g2p_prons_for_oov_no_acoustics.txt ]; then + awk 'NR==FNR{a[$1] = 1; next} {if(!($1 in a)) print $1}' $dir/g2p_prons_for_oov_no_acoustics.txt \ + $dir/oov_no_acoustics.txt | awk -v op="$oov_pron" '{print $0" "op}' > $dir/oov_target_vocab_no_pron.txt || exit 1; + else + awk -v op="$oov_pron" '{print $0" "op}' $dir/oov_no_acoustics.txt > $dir/oov_target_vocab_no_pron.txt || exit 1 + fi + + # We concatenate three lexicons togethers: G2P lexicon for oov words without acoustics, + # learned lexicon for oov words with acoustics, and the original reference lexicon (for + # this part, later one we'll apply recommended changes using steps/dict/apply_lexicon_edits.py + cat $dir/g2p_prons_for_oov_no_acoustics.txt $dir/lats_iter2/out_of_ref_vocab_prons_learned.txt \ + $dir/oov_target_vocab_no_pron.txt $dir/ref_lexicon.txt | tr -s '\t' ' ' | sort | uniq > $dest_dict/lexicon.temp + + awk 'NR==FNR{a[$1] = 1; next} ($1 in a)' $dir/target_vocab.txt \ + $dest_dict/lexicon.temp | sort | uniq > $dest_dict/lexicon.nosil + + cat $dir/non_scored_entries $dest_dict/lexicon.nosil | sort | uniq >$dest_dict/lexicon0.txt +fi + +if [ $stage -le 8 ]; then + echo "$0: Apply the ref_lexicon_edits file to the reference lexicon." + echo " ... The user can inspect/modify the edits file and then re-run:" + echo " ... steps/dict/apply_lexicon_edits.py $dest_dict/lexicon0.txt $dir/lats_iter2/ref_lexicon_edits.txt - | \\" + echo " ... sort -u \> $dest_dict/lexicon.txt to re-produce the final learned lexicon." + cp $dir/lats_iter2/ref_lexicon_edits.txt $dest_dict/lexicon_edits.txt 2>/dev/null + steps/dict/apply_lexicon_edits.py $dest_dict/lexicon0.txt $dir/lats_iter2/ref_lexicon_edits.txt - | \ + sort | uniq > $dest_dict/lexicon.txt || exit 1; +fi + +echo "Lexicon learning ends successfully. Please refer to $dir/lats_iter2/log/lexicon_learning_summary.log" +echo " for a summary. The learned lexicon, whose vocab matches the target_vocab, is $dest_dict/lexicon.txt" diff --git a/egs/wsj/s5/steps/dict/merge_learned_lexicons.py b/egs/wsj/s5/steps/dict/merge_learned_lexicons.py new file mode 100755 index 00000000000..6df7eb7a744 --- /dev/null +++ b/egs/wsj/s5/steps/dict/merge_learned_lexicons.py @@ -0,0 +1,261 @@ +#!/usr/bin/env python + +# Copyright 2018 Xiaohui Zhang +# Apache 2.0. + +from __future__ import print_function +from collections import defaultdict +import argparse +import sys +import math + +def GetArgs(): + parser = argparse.ArgumentParser( + description = "Convert a learned lexicon produced by steps/dict/select_prons_greedy.sh" + "into a lexicon for OOV words (w.r.t. ref. 
vocab) and a human-editable lexicon-edit file." + "for in-vocab words, and generate detailed summaries of the lexicon learning results" + "The inputs are a learned lexicon, an arc-stats file, and three source lexicons " + "(phonetic-decoding(PD)/G2P/ref). The outputs are: a learned lexicon for OOVs" + "(learned_lexicon_oov), and a lexicon_edits file (ref_lexicon_edits) containing" + "suggested modifications of prons, for in-vocab words.", + epilog = "See steps/dict/learn_lexicon_greedy.sh for example.") + parser.add_argument("arc_stats_file", metavar = "", type = str, + help = "File containing word-pronunciation statistics obtained from lattices; " + "each line must be ") + parser.add_argument("word_counts_file", metavar = "", type = str, + help = "File containing word counts in acoustic training data; " + "each line must be .") + parser.add_argument("ref_lexicon", metavar = "", type = str, + help = "The reference lexicon (most probably hand-derived)." + "Each line must be ") + parser.add_argument("g2p_lexicon", metavar = "", type = str, + help = "Candidate pronunciations from G2P results." + "Each line must be ") + parser.add_argument("pd_lexicon", metavar = "", type = str, + help = "Candidate pronunciations from phonetic decoding results." + "Each line must be ") + parser.add_argument("learned_lexicon", metavar = "", type = str, + help = "Learned lexicon." + "Each line must be ") + parser.add_argument("learned_lexicon_oov", metavar = "", type = str, + help = "Output file which is the learned lexicon for words out of the ref. vocab.") + parser.add_argument("ref_lexicon_edits", metavar = "", type = str, + help = "Output file containing human-readable & editable pronunciation info (and the" + "accept/reject decision made by our algorithm) for those words in ref. vocab," + "to which any change has been recommended. The info for each word is like:" + "------------ an 4086.0 --------------" + "R | Y | 2401.6 | AH N" + "R | Y | 640.8 | AE N" + "P | Y | 1035.5 | IH N" + "R(ef), P(hone-decoding) represent the pronunciation source" + "Y/N means the recommended decision of including this pron or not" + "and the numbers are soft counts accumulated from lattice-align-word outputs.
" + "See the function WriteEditsAndSummary for more details.") + + print (' '.join(sys.argv), file=sys.stderr) + + args = parser.parse_args() + args = CheckArgs(args) + + return args + +def CheckArgs(args): + if args.arc_stats_file == "-": + args.arc_stats_file_handle = sys.stdin + else: + args.arc_stats_file_handle = open(args.arc_stats_file) + args.word_counts_file_handle = open(args.word_counts_file) + args.ref_lexicon_handle = open(args.ref_lexicon) + args.g2p_lexicon_handle = open(args.g2p_lexicon) + args.pd_lexicon_handle = open(args.pd_lexicon) + args.learned_lexicon_handle = open(args.learned_lexicon) + args.learned_lexicon_oov_handle = open(args.learned_lexicon_oov, "w") + args.ref_lexicon_edits_handle = open(args.ref_lexicon_edits, "w") + + return args + +def ReadArcStats(arc_stats_file_handle): + stats = defaultdict(lambda : defaultdict(dict)) + stats_summed = defaultdict(float) + for line in arc_stats_file_handle.readlines(): + splits = line.strip().split() + + if (len(splits) == 0): + continue + + if (len(splits) < 5): + raise Exception('Invalid format of line ' + line + + ' in ' + arc_stats_file) + utt = splits[1] + start_frame = int(splits[2]) + word = splits[0] + count = float(splits[3]) + phones = splits[4:] + phones = ' '.join(phones) + stats[word][(utt, start_frame)][phones] = count + stats_summed[(word, phones)] += count + return stats, stats_summed + +def ReadWordCounts(word_counts_file_handle): + counts = {} + for line in word_counts_file_handle.readlines(): + splits = line.strip().split() + if len(splits) < 2: + raise Exception('Invalid format of line ' + line + + ' in counts file.') + word = splits[0] + count = int(splits[1]) + counts[word] = count + return counts + +def ReadLexicon(args, lexicon_file_handle, counts): + # we're skipping any word not in counts (not seen in training data), + # cause we're only learning prons for words who have acoustic examples. + lexicon = defaultdict(set) + for line in lexicon_file_handle.readlines(): + splits = line.strip().split() + if len(splits) == 0: + continue + if len(splits) < 2: + raise Exception('Invalid format of line ' + line + + ' in lexicon file.') + word = splits[0] + if word not in counts: + continue + phones = ' '.join(splits[1:]) + lexicon[word].add(phones) + return lexicon + +def WriteEditsAndSummary(args, learned_lexicon, ref_lexicon, pd_lexicon, g2p_lexicon, counts, stats, stats_summed): + # Note that learned_lexicon and ref_lexicon are dicts of sets of prons, while the other two lexicons are sets of (word, pron) pairs. + threshold = 2 + words = [defaultdict(set) for i in range(4)] # "words" contains four bins, where we + # classify each word into, according to whether it's count > threshold, + # and whether it's OOVs w.r.t the reference lexicon. + + src = {} + print("# Note: This file contains pronunciation info for words who have candidate " + "prons from G2P/phonetic-decoding accepted in the learned lexicon" + ", sorted by their counts in acoustic training data, " + ,file=args.ref_lexicon_edits_handle) + print("# 1st Col: source of the candidate pron: G(2P) / P(hone-decoding) / R(eference)." + ,file=args.ref_lexicon_edits_handle) + print("# 2nd Col: accepted or not in the learned lexicon (Y/N).", file=args.ref_lexicon_edits_handle) + print("# 3rd Col: soft counts from lattice-alignment (not augmented by prior-counts)." + ,file=args.ref_lexicon_edits_handle) + print("# 4th Col: the pronunciation cadidate.", file=args.ref_lexicon_edits_handle) + + # words which are to be printed into the edits file. 
+ words_to_edit = [] + num_prons_tot = 0 + for word in learned_lexicon: + num_prons_tot += len(learned_lexicon[word]) + count = len(stats[word]) # This count could be smaller than the count read from the dict "counts", + # since in each sub-utterance, multiple occurences (which is rare) of the same word are compressed into one. + # We use this count here so that in the edit-file, soft counts for each word sum up to one. + flags = ['0' for i in range(3)] # "flags" contains three binary indicators, + # indicating where this word's pronunciations come from. + for pron in learned_lexicon[word]: + if word in pd_lexicon and pron in pd_lexicon[word]: + flags[0] = '1' + src[(word, pron)] = 'P' + elif word in ref_lexicon and pron in ref_lexicon[word]: + flags[1] = '1' + src[(word, pron)] = 'R' + elif word in g2p_lexicon and pron in g2p_lexicon[word]: + flags[2] = '1' + src[(word, pron)] = 'G' + if word in ref_lexicon: + all_ref_prons_accepted = True + for pron in ref_lexicon[word]: + if pron not in learned_lexicon[word]: + all_ref_prons_accepted = False + break + if not all_ref_prons_accepted or flags[0] == '1' or flags[2] == '1': + words_to_edit.append((word, len(stats[word]))) + if count > threshold: + words[0][flags[0] + flags[1] + flags[2]].add(word) + else: + words[1][flags[0] + flags[1] + flags[2]].add(word) + else: + if count > threshold: + words[2][flags[0] + flags[2]].add(word) + else: + words[3][flags[0] + flags[2]].add(word) + + words_to_edit_sorted = sorted(words_to_edit, key=lambda entry: entry[1], reverse=True) + for word, count in words_to_edit_sorted: + print("------------",word, "%2.1f" % count, "--------------", file=args.ref_lexicon_edits_handle) + learned_prons = [] + for pron in learned_lexicon[word]: + learned_prons.append((src[(word, pron)], 'Y', stats_summed[(word, pron)], pron)) + for pron in ref_lexicon[word]: + if pron not in learned_lexicon[word]: + learned_prons.append(('R', 'N', stats_summed[(word, pron)], pron)) + learned_prons_sorted = sorted(learned_prons, key=lambda item: item[2], reverse=True) + for item in learned_prons_sorted: + print('{} | {} | {:.2f} | {}'.format(item[0], item[1], item[2], item[3]), file=args.ref_lexicon_edits_handle) + + num_oovs_with_acoustic_evidence = len(set(learned_lexicon.keys()).difference(set(ref_lexicon.keys()))) + num_oovs = len(set(counts.keys()).difference(set(ref_lexicon.keys()))) + num_ivs = len(learned_lexicon) - num_oovs_with_acoustic_evidence + print("Average num. 
prons per word in the learned lexicon is {}".format(float(num_prons_tot)/float(len(learned_lexicon))), file=sys.stderr) + # print("Here are the words whose reference pron candidates were all declined", words[0]['100'], file=sys.stderr) + print("-------------------------------------------------Summary------------------------------------------", file=sys.stderr) + print("We have acoustic evidence for {} out of {} in-vocab (w.r.t the reference lexicon) words from the acoustic training data.".format(num_ivs, len(ref_lexicon)), file=sys.stderr) + print(" Among those frequent words whose counts in the training text > ", threshold, ":", file=sys.stderr) + num_freq_ivs_from_all_sources = len(words[0]['111']) + len(words[0]['110']) + len(words[0]['011']) + num_freq_ivs_from_g2p_or_phonetic_decoding = len(words[0]['101']) + len(words[0]['001']) + len(words[0]['100']) + num_freq_ivs_from_ref = len(words[0]['010']) + num_infreq_ivs_from_all_sources = len(words[1]['111']) + len(words[1]['110']) + len(words[1]['011']) + num_infreq_ivs_from_g2p_or_phonetic_decoding = len(words[1]['101']) + len(words[1]['001']) + len(words[1]['100']) + num_infreq_ivs_from_ref = len(words[1]['010']) + print(' {} words\' selected prons came from the reference lexicon, G2P/phonetic-decoding.'.format(num_freq_ivs_from_all_sources), file=sys.stderr) + print(' {} words\' selected prons come from G2P/phonetic-decoding-generated.'.format(num_freq_ivs_from_g2p_or_phonetic_decoding), file=sys.stderr) + print(' {} words\' selected prons came from the reference lexicon only.'.format(num_freq_ivs_from_ref), file=sys.stderr) + print(' For those words whose counts in the training text <= {}:'.format(threshold), file=sys.stderr) + print(' {} words\' selected prons came from the reference lexicon, G2P/phonetic-decoding.'.format(num_infreq_ivs_from_all_sources), file=sys.stderr) + print(' {} words\' selected prons come from G2P/phonetic-decoding-generated.'.format(num_infreq_ivs_from_g2p_or_phonetic_decoding), file=sys.stderr) + print(' {} words\' selected prons came from the reference lexicon only.'.format(num_infreq_ivs_from_ref), file=sys.stderr) + print("---------------------------------------------------------------------------------------------------", file=sys.stderr) + num_freq_oovs_from_both_sources = len(words[2]['11']) + num_freq_oovs_from_phonetic_decoding = len(words[2]['10']) + num_freq_oovs_from_g2p = len(words[2]['01']) + num_infreq_oovs_from_both_sources = len(words[3]['11']) + num_infreq_oovs_from_phonetic_decoding = len(words[3]['10']) + num_infreq_oovs_from_g2p = len(words[3]['01']) + print('We have acoustic evidence for {} out of {} OOV (w.r.t the reference lexicon) words from the acoustic training data.'.format(num_oovs_with_acoustic_evidence, num_oovs), file=sys.stderr) + print(' Among those words whose counts in the training text > {}:'.format(threshold), file=sys.stderr) + print(' {} words\' selected prons came from G2P and phonetic-decoding.'.format(num_freq_oovs_from_both_sources), file=sys.stderr) + print(' {} words\' selected prons came from phonetic decoding only.'.format(num_freq_oovs_from_phonetic_decoding), file=sys.stderr) + print(' {} words\' selected prons came from G2P only.'.format(num_freq_oovs_from_g2p), file=sys.stderr) + print(' For those words whose counts in the training text <= {}:'.format(threshold), file=sys.stderr) + print(' {} words\' selected prons came from G2P and phonetic-decoding.'.format(num_infreq_oovs_from_both_sources), file=sys.stderr) + print(' {} words\' selected prons came from 
phonetic decoding only.'.format(num_infreq_oovs_from_phonetic_decoding), file=sys.stderr) + print(' {} words\' selected prons came from G2P only.'.format(num_infreq_oovs_from_g2p), file=sys.stderr) + +def WriteLearnedLexiconOov(learned_lexicon, ref_lexicon, file_handle): + for word, prons in learned_lexicon.iteritems(): + if word not in ref_lexicon: + for pron in prons: + print('{0} {1}'.format(word, pron), file=file_handle) + file_handle.close() + +def Main(): + args = GetArgs() + + # Read in three lexicon sources, word counts, and pron stats. + counts = ReadWordCounts(args.word_counts_file_handle) + ref_lexicon = ReadLexicon(args, args.ref_lexicon_handle, counts) + g2p_lexicon = ReadLexicon(args, args.g2p_lexicon_handle, counts) + pd_lexicon = ReadLexicon(args, args.pd_lexicon_handle, counts) + stats, stats_summed = ReadArcStats(args.arc_stats_file_handle) + learned_lexicon = ReadLexicon(args, args.learned_lexicon_handle, counts) + + # Write the learned prons for words out of the ref. vocab into learned_lexicon_oov. + WriteLearnedLexiconOov(learned_lexicon, ref_lexicon, args.learned_lexicon_oov_handle) + # Edits will be printed into ref_lexicon_edits, and the summary will be printed into stderr. + WriteEditsAndSummary(args, learned_lexicon, ref_lexicon, pd_lexicon, g2p_lexicon, counts, stats, stats_summed) + +if __name__ == "__main__": + Main() diff --git a/egs/wsj/s5/steps/dict/prons_to_lexicon.py b/egs/wsj/s5/steps/dict/prons_to_lexicon.py index 2a87d172602..37d7810411b 100755 --- a/egs/wsj/s5/steps/dict/prons_to_lexicon.py +++ b/egs/wsj/s5/steps/dict/prons_to_lexicon.py @@ -6,6 +6,7 @@ # we're using python 3.x style print but want it to work in python 2.x, from __future__ import print_function +from collections import defaultdict import argparse import sys @@ -21,15 +22,15 @@ def __call__(self, parser, namespace, values, option_string=None): raise Exception("Unknown value {0} for --{1}".format(values, self.dest)) def GetArgs(): - parser = argparse.ArgumentParser(description = "Converts pronunciation statistics (from phone level decoding) " - "into a lexicon for lexicon learning. We prune the pronunciations " + parser = argparse.ArgumentParser(description = "Converts pronunciation statistics (from phonetic decoding or g2p) " + "into a lexicon for. We prune the pronunciations " "based on a provided stats file, and optionally filter out entries which are present " "in a filter lexicon.", epilog = "e.g. steps/dict/prons_to_lexicon.py --min-prob=0.4 \\" "--filter-lexicon=exp/tri3_lex_0.4_work/phone_decode/filter_lexicon.txt \\" "exp/tri3_lex_0.4_work/phone_decode/prons.txt \\" "exp/tri3_lex_0.4_work/lexicon_phone_decoding.txt" - "See steps/dict/learn_lexicon.sh for examples in detail.") + "See steps/dict/learn_lexicon_greedy.sh for examples in detail.") parser.add_argument("--set-sum-to-one", type = str, default = False, action = StrToBoolAction, choices = ["true", "false"], @@ -39,6 +40,8 @@ def GetArgs(): action = StrToBoolAction, choices = ["true", "false"], help = "If normalize lexicon such that the max " "probability is 1.") + parser.add_argument("--top-N", type = int, default = 0, + help = "If non-zero, we just take the top N pronunciations (according to stats/pron-probs) for each word.") parser.add_argument("--min-prob", type = float, default = 0.1, help = "Remove pronunciation with probabilities less " "than this value after normalization.") @@ -46,8 +49,7 @@ def GetArgs(): help = "Exclude entries in this filter lexicon from the output lexicon." 
"each line must be ") parser.add_argument("stats_file", metavar='', type = str, - help = "Input file containing pronunciation statistics, representing how many times " - "each word-pronunciation appear in the phonetic decoding results." + help = "Input lexicon file containing pronunciation statistics/probs in the first column." "each line must be ") parser.add_argument("out_lexicon", metavar='', type = str, help = "Output lexicon.") @@ -150,6 +152,18 @@ def NormalizeLexicon(lexicon, set_max_to_one = True, prob = 0 lexicon[entry] = prob +def TakeTopN(lexicon, top_N): + lexicon_reshaped = defaultdict(list) + lexicon_pruned = {} + for entry, prob in lexicon.iteritems(): + lexicon_reshaped[entry[0]].append([entry[1], prob]) + for word in lexicon_reshaped: + prons = lexicon_reshaped[word] + sorted_prons = sorted(prons, reverse=True, key=lambda prons: prons[1]) + for i in range(len(sorted_prons)): + if i >= top_N: + lexicon[(word, sorted_prons[i][0])] = 0 + def WriteLexicon(args, lexicon, filter_lexicon): words = set() num_removed = 0 @@ -179,10 +193,15 @@ def Main(): word_probs = ConvertWordCountsToProbs(args, lexicon, word_count) lexicon = ConvertWordProbsToLexicon(word_probs) - filter_lexicon = ReadLexicon(args.filter_lexicon_handle) - NormalizeLexicon(lexicon, set_max_to_one = args.set_max_to_one, - set_sum_to_one = args.set_sum_to_one, - min_prob = args.min_prob) + filter_lexicon = set() + if args.filter_lexicon is not '': + filter_lexicon = ReadLexicon(args.filter_lexicon_handle) + if args.top_N > 0: + TakeTopN(lexicon, args.top_N) + else: + NormalizeLexicon(lexicon, set_max_to_one = args.set_max_to_one, + set_sum_to_one = args.set_sum_to_one, + min_prob = args.min_prob) WriteLexicon(args, lexicon, filter_lexicon) args.out_lexicon_handle.close() diff --git a/egs/wsj/s5/steps/dict/prune_pron_candidates.py b/egs/wsj/s5/steps/dict/prune_pron_candidates.py index affc5b17705..cd90a389a7c 100755 --- a/egs/wsj/s5/steps/dict/prune_pron_candidates.py +++ b/egs/wsj/s5/steps/dict/prune_pron_candidates.py @@ -4,6 +4,7 @@ # Apache 2.0. from __future__ import print_function +from __future__ import division from collections import defaultdict import argparse import sys @@ -16,7 +17,7 @@ def GetArgs(): "(For words in the reference lexicon, N = # pron variants given by the reference" "lexicon; For oov words, N = avg. # pron variants per word in the reference lexicon)." "r is a user-specified constant, like 2.", - epilog = "See steps/dict/learn_lexicon.sh for example") + epilog = "See steps/dict/learn_lexicon_greedy.sh for example") parser.add_argument("--r", type = float, default = "2.0", help = "a user-specified ratio parameter which determines how many" @@ -61,7 +62,7 @@ def ReadStats(pron_stats_handle): phones = ' '.join(splits[2:]) stats[word].append((phones, count)) - for word, entry in stats.iteritems(): + for word, entry in stats.items(): entry.sort(key=lambda x: x[1]) return stats @@ -86,12 +87,12 @@ def PruneProns(args, stats, ref_lexicon): # Compute the average # pron variants counts per word in the reference lexicon. 
num_words_ref = 0 num_prons_ref = 0 - for word, prons in ref_lexicon.iteritems(): + for word, prons in ref_lexicon.items(): num_words_ref += 1 num_prons_ref += len(prons) avg_variants_counts_ref = math.ceil(float(num_prons_ref) / float(num_words_ref)) - for word, entry in stats.iteritems(): + for word, entry in stats.items(): if word in ref_lexicon: variants_counts = args.r * len(ref_lexicon[word]) else: @@ -105,7 +106,7 @@ def PruneProns(args, stats, ref_lexicon): except IndexError: break - for word, entry in stats.iteritems(): + for word, entry in stats.items(): for pron, prob in entry: if word not in ref_lexicon or pron not in ref_lexicon[word]: print('{0} {1}'.format(word, pron), file=args.pruned_prons_handle) diff --git a/egs/wsj/s5/steps/dict/select_prons_bayesian.py b/egs/wsj/s5/steps/dict/select_prons_bayesian.py index e728a4af0b8..893dd7cb818 100755 --- a/egs/wsj/s5/steps/dict/select_prons_bayesian.py +++ b/egs/wsj/s5/steps/dict/select_prons_bayesian.py @@ -4,6 +4,7 @@ # Apache 2.0. from __future__ import print_function +from __future__ import division from collections import defaultdict import argparse import sys @@ -23,7 +24,7 @@ def GetArgs(): "a learned lexicon for words out of the ref. vocab (learned_lexicon_oov)," "and a lexicon_edits file containing suggested modifications of prons, for" "words within the ref. vocab (ref_lexicon_edits).", - epilog = "See steps/dict/learn_lexicon.sh for example.") + epilog = "See steps/dict/learn_lexicon_bayesian.sh for example.") parser.add_argument("--prior-mean", type = str, default = "0,0,0", help = "Mean of priors (summing up to 1) assigned to three exclusive n" "pronunciatio sources: reference lexicon, g2p, and phonetic decoding. We " @@ -162,7 +163,7 @@ def FilterPhoneticDecodingLexicon(args, phonetic_decoding_lexicon, stats): for line in args.silence_file_handle: silphones.add(line.strip()) rejected_candidates = set() - for word, prons in phonetic_decoding_lexicon.iteritems(): + for word, prons in phonetic_decoding_lexicon.items(): for pron in prons: for phone in pron.split(): if phone in silphones: @@ -194,7 +195,7 @@ def ComputePriorCounts(args, counts, ref_lexicon, g2p_lexicon, phonetic_decoding prior_mean[2] = 0 prior_mean_sum = sum(prior_mean) try: - prior_mean = [t / prior_mean_sum for t in prior_mean] + prior_mean = [float(t) / prior_mean_sum for t in prior_mean] except ZeroDivisionError: print('WARNING: word {} appears in train_counts but not in any lexicon.'.format(word), file=sys.stderr) prior_counts[word] = [t * args.prior_counts_tot for t in prior_mean] @@ -206,20 +207,20 @@ def ComputePosteriors(args, stats, ref_lexicon, g2p_lexicon, phonetic_decoding_l # The soft-counts were augmented by a user-specified prior count, according the source # (ref/G2P/phonetic-decoding) of this pronunciation. 
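A rough numerical illustration, with invented counts, of the augmentation described in the comment above: each candidate pronunciation's observed soft count is increased by an equal share of its source's prior count, and the augmented counts are then normalized into per-word posteriors.

    # Word "foo": two reference prons and one G2P pron; assume
    # prior_mean = [0.7, 0.2, 0.1] (ref, g2p, pd) and prior_counts_tot = 15,
    # giving per-source prior counts of [10.5, 3.0, 1.5].
    prior_counts = [10.5, 3.0, 1.5]
    observed = {'f uw': 6.0, 'f ow': 1.0, 'f uh': 2.5}     # soft counts from lattices

    augmented = {
        'f uw': prior_counts[0] / 2 + observed['f uw'],    # ref pron: 5.25 + 6.0 = 11.25
        'f ow': prior_counts[0] / 2 + observed['f ow'],    # ref pron: 5.25 + 1.0 = 6.25
        'f uh': prior_counts[1] / 1 + observed['f uh'],    # g2p pron: 3.0  + 2.5 = 5.5
    }
    total = sum(augmented.values())                        # 23.0
    posteriors = {pron: c / total for pron, c in augmented.items()}
    print(posteriors)  # {'f uw': ~0.489, 'f ow': ~0.272, 'f uh': ~0.239}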
- for word, prons in ref_lexicon.iteritems(): + for word, prons in ref_lexicon.items(): for pron in prons: # c is the augmented soft count (observed count + prior count) - c = prior_counts[word][0] / len(ref_lexicon[word]) + stats.get((word, pron), 0) + c = float(prior_counts[word][0]) / len(ref_lexicon[word]) + stats.get((word, pron), 0) posteriors[word].append((pron, c)) - for word, prons in g2p_lexicon.iteritems(): + for word, prons in g2p_lexicon.items(): for pron in prons: - c = prior_counts[word][1] / len(g2p_lexicon[word]) + stats.get((word, pron), 0) + c = float(prior_counts[word][1]) / len(g2p_lexicon[word]) + stats.get((word, pron), 0) posteriors[word].append((pron, c)) - for word, prons in phonetic_decoding_lexicon.iteritems(): + for word, prons in phonetic_decoding_lexicon.items(): for pron in prons: - c = prior_counts[word][2] / len(phonetic_decoding_lexicon[word]) + stats.get((word, pron), 0) + c = float(prior_counts[word][2]) / len(phonetic_decoding_lexicon[word]) + stats.get((word, pron), 0) posteriors[word].append((pron, c)) num_prons_from_ref = sum(len(ref_lexicon[i]) for i in ref_lexicon) @@ -239,10 +240,10 @@ def ComputePosteriors(args, stats, ref_lexicon, g2p_lexicon, phonetic_decoding_l # each entry is a pair: (prounciation, count) count_sum[word] = sum([entry[1] for entry in posteriors[word]]) - for word, entry in posteriors.iteritems(): + for word, entry in posteriors.items(): new_entry = [] for pron, count in entry: - post = count / count_sum[word] + post = float(count) / count_sum[word] new_entry.append((pron, post)) source = 'R' if word in g2p_lexicon and pron in g2p_lexicon[word]: @@ -260,7 +261,7 @@ def SelectPronsBayesian(args, counts, posteriors, ref_lexicon, g2p_lexicon, phon phonetic_decoding_selected = 0 learned_lexicon = defaultdict(set) - for word, entry in posteriors.iteritems(): + for word, entry in posteriors.items(): num_variants = 0 post_tot = 0.0 variants_counts = args.variants_counts @@ -411,7 +412,7 @@ def WriteEditsAndSummary(args, learned_lexicon, ref_lexicon, phonetic_decoding_l print(' {} words\' selected prons came from G2P only.'.format(num_infreq_oovs_from_g2p), file=sys.stderr) def WriteLearnedLexiconOov(learned_lexicon, ref_lexicon, file_handle): - for word, prons in learned_lexicon.iteritems(): + for word, prons in learned_lexicon.items(): if word not in ref_lexicon: for pron in prons: print('{0} {1}'.format(word, pron), file=file_handle) diff --git a/egs/wsj/s5/steps/dict/select_prons_greedy.py b/egs/wsj/s5/steps/dict/select_prons_greedy.py new file mode 100755 index 00000000000..cf71070e134 --- /dev/null +++ b/egs/wsj/s5/steps/dict/select_prons_greedy.py @@ -0,0 +1,376 @@ +#!/usr/bin/env python + +# Copyright 2018 Xiaohui Zhang +# Apache 2.0. + +from __future__ import print_function +from collections import defaultdict +import argparse +import sys +import math + +def GetArgs(): + parser = argparse.ArgumentParser( + description = "Use a greedy framework to select pronunciation candidates" + "from three sources: a reference lexicon, G2P lexicon and phonetic-decoding" + "(PD) lexicon. Basically, this script implements the Alg. 1 in the paper:" + "Acoustic data-driven lexicon learning based on a greedy pronunciation " + "selection framework, by X. Zhang, V. Mahonar, D. Povey and S. Khudanpur," + "Interspeech 2017. The inputs are an arc-stats file, containing " + "acoustic evidence (tau_{uwb} in the paper) and three source lexicons " + "(phonetic-decoding(PD)/G2P/ref). 
The outputs is the learned lexicon for" + "all words in the arc_stats (acoustic evidence) file.", + epilog = "See steps/dict/learn_lexicon_greedy.sh for example.") + parser.add_argument("--alpha", type = str, default = "0,0,0", + help = "Scaling factors for the likelihood reduction threshold." + "of three pronunciaiton candidate sources: phonetic-decoding (PD)," + "G2P and reference. The valid range of each dimension is [0, 1], and" + "a large value means we prune pronunciations from this source more" + "aggressively. Setting a dimension to zero means we never want to remove" + "pronunciaiton from that source. See Section 4.3 in the paper for details.") + parser.add_argument("--beta", type = str, default = "0,0,0", + help = "smoothing factors for the likelihood reduction term." + "of three pronunciaiton candidate sources: phonetic-decoding (PD)," + "G2P and reference. The valid range of each dimension is [0, 100], and" + "a large value means we prune pronunciations from this source more" + "aggressively. See Section 4.3 in the paper for details.") + parser.add_argument("--delta", type = float, default = 0.000000001, + help = "Floor value of the pronunciation posterior statistics." + "The valid range is (0, 0.01)," + "See Section 3 in the paper for details.") + parser.add_argument("silence_phones_file", metavar = "", type = str, + help = "File containing a list of silence phones.") + parser.add_argument("arc_stats_file", metavar = "", type = str, + help = "File containing word-pronunciation statistics obtained from lattices; " + "each line must be ") + parser.add_argument("word_counts_file", metavar = "", type = str, + help = "File containing word counts in acoustic training data; " + "each line must be .") + parser.add_argument("ref_lexicon", metavar = "", type = str, + help = "The reference lexicon (most probably hand-derived)." + "Each line must be ") + parser.add_argument("g2p_lexicon", metavar = "", type = str, + help = "Candidate ronouciations from G2P results." + "Each line must be ") + parser.add_argument("pd_lexicon", metavar = "", type = str, + help = "Candidate ronouciations from phonetic decoding results." + "Each line must be ") + parser.add_argument("learned_lexicon", metavar = "", type = str, + help = "Learned lexicon.") + + + print (' '.join(sys.argv), file=sys.stderr) + + args = parser.parse_args() + args = CheckArgs(args) + + return args + +def CheckArgs(args): + args.silence_phones_file_handle = open(args.silence_phones_file) + if args.arc_stats_file == "-": + args.arc_stats_file_handle = sys.stdin + else: + args.arc_stats_file_handle = open(args.arc_stats_file) + args.word_counts_file_handle = open(args.word_counts_file) + args.ref_lexicon_handle = open(args.ref_lexicon) + args.g2p_lexicon_handle = open(args.g2p_lexicon) + args.pd_lexicon_handle = open(args.pd_lexicon) + args.learned_lexicon_handle = open(args.learned_lexicon, "w") + + alpha = args.alpha.strip().split(',') + if len(alpha) is not 3: + raise Exception('Invalid alpha ', args.alpha) + for i in range(0,3): + if float(alpha[i]) < 0 or float(alpha[i]) > 1: + raise Exception('alaph ', alpha[i], + ' is invalid, it must be within [0, 1].') + if float(alpha[i]) == 0: + alpha[i] = -1e-3 + # The absolute likelihood loss (search for loss_abs) is supposed to be positive. + # But it could be negative near zero because of numerical precision limit. 
+ # In this case, even if alpha is set to be zero, which means we never want to + # remove pronunciation from that source, the quality score (search for q_b) + # could still be negative, which means this pron could be potentially removed. + # To prevent this, we set alpha as a negative value near zero to ensure + # q_b is always positive. + + args.alpha = [float(alpha[0]), float(alpha[1]), float(alpha[2])] + print("[alpha_{pd}, alpha_{g2p}, alpha_{ref}] is: ", args.alpha) + exit + beta = args.beta.strip().split(',') + if len(beta) is not 3: + raise Exception('Invalid beta ', args.beta) + for i in range(0,3): + if float(beta[i]) < 0 or float(beta[i]) > 100: + raise Exception('beta ', beta[i], + ' is invalid, it must be within [0, 100].') + args.beta = [float(beta[0]), float(beta[1]), float(beta[2])] + print("[beta_{pd}, beta_{g2p}, beta_{ref}] is: ", args.beta) + + if args.delta <= 0 or args.delta > 0.1: + raise Exception('delta ', args.delta, ' is invalid, it must be within' + '(0, 0.01).') + print("delta is: ", args.delta) + + return args + +def ReadArcStats(arc_stats_file_handle): + stats = defaultdict(lambda : defaultdict(dict)) + stats_summed = defaultdict(float) + for line in arc_stats_file_handle.readlines(): + splits = line.strip().split() + + if (len(splits) == 0): + continue + + if (len(splits) < 5): + raise Exception('Invalid format of line ' + line + + ' in ' + arc_stats_file) + utt = splits[1] + start_frame = int(splits[2]) + word = splits[0] + count = float(splits[3]) + phones = splits[4:] + phones = ' '.join(phones) + stats[word][(utt, start_frame)][phones] = count + stats_summed[(word, phones)] += count + return stats, stats_summed + +def ReadWordCounts(word_counts_file_handle): + counts = {} + for line in word_counts_file_handle.readlines(): + splits = line.strip().split() + if len(splits) < 2: + raise Exception('Invalid format of line ' + line + + ' in counts file.') + word = splits[0] + count = int(splits[1]) + counts[word] = count + return counts + +def ReadLexicon(args, lexicon_file_handle, counts): + # we're skipping any word not in counts (not seen in training data), + # cause we're only learning prons for words who have acoustic examples. + lexicon = defaultdict(set) + for line in lexicon_file_handle.readlines(): + splits = line.strip().split() + if len(splits) == 0: + continue + if len(splits) < 2: + raise Exception('Invalid format of line ' + line + + ' in lexicon file.') + word = splits[0] + if word not in counts: + continue + phones = ' '.join(splits[1:]) + lexicon[word].add(phones) + return lexicon + +def FilterPhoneticDecodingLexicon(args, pd_lexicon): + # We want to remove all candidates which contain silence phones + silphones = set() + for line in args.silence_phones_file_handle: + silphones.add(line.strip()) + rejected_candidates = set() + for word, prons in pd_lexicon.iteritems(): + for pron in prons: + for phone in pron.split(): + if phone in silphones: + rejected_candidates.add((word, pron)) + break + for word, pron in rejected_candidates: + pd_lexicon[word].remove(pron) + return pd_lexicon + +# One iteration of Expectation-Maximization computation (Eq. 3-4 in the paper). 
+def OneEMIter(args, word, stats, prons, pron_probs, debug=False): + prob_acc = [0.0 for i in range(len(prons[word]))] + s = sum(pron_probs) + for i in range(len(pron_probs)): + pron_probs[i] = pron_probs[i] / s + log_like = 0.0 + for (utt, start_frame) in stats[word]: + prob = [] + soft_counts = [] + for i in range(len(prons[word])): + phones = prons[word][i] + soft_count = stats[word][(utt, start_frame)].get(phones, 0) + if soft_count < args.delta: + soft_count = args.delta + soft_counts.append(soft_count) + prob = [i[0] * i[1] for i in zip(soft_counts, pron_probs)] + for i in range(len(prons[word])): + prob_acc[i] += prob[i] / sum(prob) + log_like += math.log(sum(prob)) + pron_probs = [1.0 / float(len(stats[word])) * p for p in prob_acc] + log_like = 1.0 / float(len(stats[word])) * log_like + if debug: + print("Log_like of the word: ", log_like, "pron probs: ", pron_probs) + return pron_probs, log_like + +def SelectPronsGreedy(args, stats, counts, ref_lexicon, g2p_lexicon, pd_lexicon, dianostic_info=False): + prons = defaultdict(list) # Put all possible prons from three source lexicons into this dictionary + src = {} # Source of each (word, pron) pair: 'P' = phonetic-decoding, 'G' = G2P, 'R' = reference + learned_lexicon = defaultdict(set) # Put all selected prons in this dictionary + for lexicon in ref_lexicon, g2p_lexicon, pd_lexicon: + for word in lexicon: + for pron in lexicon[word]: + prons[word].append(pron) + for word in prons: + for pron in prons[word]: + if word in pd_lexicon and pron in pd_lexicon[word]: + src[(word, pron)] = 'P' + if word in g2p_lexicon and pron in g2p_lexicon[word]: + src[(word, pron)] = 'G' + if word in ref_lexicon and pron in ref_lexicon[word]: + src[(word, pron)] = 'R' + + for word in prons: + if word not in stats: + continue + n = len(prons[word]) + pron_probs = [1/float(n) for i in range(n)] + if dianostic_info: + print("pronunciations of word '{}': {}".format(word, prons[word])) + active_indexes = set(range(len(prons[word]))) + + deleted_prons = [] # indexes of prons to be deleted + soft_counts_normalized = [] + while len(active_indexes) > 1: + log_like = 1.0 + log_like_last = -1.0 + num_iters = 0 + while abs(log_like - log_like_last) > 1e-7: + num_iters += 1 + log_like_last = log_like + pron_probs, log_like = OneEMIter(args, word, stats, prons, pron_probs, False) + if log_like_last == 1.0 and len(soft_counts_normalized) == 0: # the first iteration + soft_counts_normalized = pron_probs + if dianostic_info: + print("Avg.(over all egs) soft counts: {}".format(soft_counts_normalized)) + if dianostic_info: + print("\n Log_like after {} iters of EM: {}, estimated pron_probs: {} \n".format( + num_iters, log_like, pron_probs)) + candidates_to_delete = [] + + for i in active_indexes: + pron_probs_mod = [p for p in pron_probs] + pron_probs_mod[i] = 0.0 + for j in range(len(pron_probs_mod)): + if j in active_indexes and j != i: + pron_probs_mod[j] += 0.01 + pron_probs_mod = [s / sum(pron_probs_mod) for s in pron_probs_mod] + log_like2 = 1.0 + log_like2_last = -1.0 + num_iters2 = 0 + # Running EM until convengence + while abs(log_like2 - log_like2_last) > 0.001 : + num_iters2 += 1 + log_like2_last = log_like2 + pron_probs_mod, log_like2 = OneEMIter(args, word, stats, + prons, pron_probs_mod, False) + + loss_abs = log_like - log_like2 # absolute likelihood loss before normalization + # (supposed to be positive, but could be negative near zero because of numerical precision limit). 
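A self-contained toy rendering, with made-up numbers, of the per-word EM update that OneEMIter above implements: the E-step weights each example's soft counts by the current pronunciation probabilities and normalizes per example, and the M-step averages the resulting per-example posteriors.

    # Three training examples of one word, two candidate prons; each row holds
    # the per-example soft counts (tau_{uwb} in the paper).
    soft_counts = [[0.9, 0.1], [0.6, 0.4], [0.2, 0.8]]
    pron_probs = [0.5, 0.5]                    # uniform initialization

    for _ in range(100):                       # iterate to (near) convergence
        acc = [0.0, 0.0]
        for counts in soft_counts:
            joint = [c * p for c, p in zip(counts, pron_probs)]
            total = sum(joint)
            acc = [a + j / total for a, j in zip(acc, joint)]
        pron_probs = [a / len(soft_counts) for a in acc]

    print(pron_probs)                          # converges to roughly [0.70, 0.30]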
+ log_delta = math.log(args.delta) + thr = -log_delta + loss = loss_abs + source = src[(word, prons[word][i])] + if dianostic_info: + print("\n set the pron_prob of '{}' whose source is {}, to zero results in {}" + " loss in avg. log-likelihood; Num. iters until converging:{}. ".format( + prons[word][i], source, loss, num_iters2)) + # Compute quality score q_b = loss_abs * / (M_w + beta_s(b)) + alpha_s(b) * log_delta + # See Sec. 4.3 and Alg. 1 in the paper. + if source == 'P': + thr *= args.alpha[0] + loss *= float(len(stats[word])) / (float(len(stats[word])) + args.beta[0]) + if source == 'G': + thr *= args.alpha[1] + loss *= float(len(stats[word])) / (float(len(stats[word])) + args.beta[1]) + if source == 'R': + thr *= args.alpha[2] + loss *= float(len(stats[word])) / (float(len(stats[word])) + args.beta[2]) + if loss - thr < 0: # loss - thr here is just q_b + if dianostic_info: + print("Smoothed log-like loss {} is smaller than threshold {} so that the quality" + "score {} is negative, adding the pron to the list of candidates to delete" + ". ".format(loss, thr, loss-thr)) + candidates_to_delete.append((loss-thr, i)) + if len(candidates_to_delete) == 0: + break + candidates_to_delete_sorted = sorted(candidates_to_delete, + key=lambda candidates_to_delete: candidates_to_delete[0]) + + deleted_candidate = candidates_to_delete_sorted[0] + active_indexes.remove(deleted_candidate[1]) + pron_probs[deleted_candidate[1]] = 0.0 + for i in range(len(pron_probs)): + if i in active_indexes: + pron_probs[i] += 0.01 + pron_probs = [s / sum(pron_probs) for s in pron_probs] + source = src[(word, prons[word][deleted_candidate[1]])] + pron = prons[word][deleted_candidate[1]] + soft_count = soft_counts_normalized[deleted_candidate[1]] + quality_score = deleted_candidate[0] + # This part of diagnostic info provides hints to the user on how to adjust the parameters. + if dianostic_info: + print("removed pron {}, from source {} with quality score {:.5f}".format( + pron, source, quality_score)) + if (source == 'P' and soft_count > 0.7 and len(stats[word]) > 5): + print("WARNING: alpha_{pd} or beta_{pd} may be too large!" + " For the word '{}' whose count is {}, the candidate " + " pronunciation from phonetic decoding '{}' with normalized " + " soft count {} (out of 1) is rejected. It shouldn't have been" + " rejected if alpha_{pd} is smaller than {}".format( + word, len(stats[word]), pron, soft_count, -loss / log_delta, + -args.alpha[0] * len(stats[word]) + (objf_change + args.beta[0])), + file=sys.stderr) + if loss_abs > thr: + print(" or beta_{pd} is smaller than {}".format( + (loss_abs / thr - 1) * len(stats[word])), file=sys.stderr) + if (source == 'G' and soft_count > 0.7 and len(stats[word]) > 5): + print("WARNING: alpha_{g2p} or beta_{g2p} may be too large!" + " For the word '{}' whose count is {}, the candidate " + " pronunciation from G2P '{}' with normalized " + " soft count {} (out of 1) is rejected. 
It shouldn't have been" + " rejected if alpha_{g2p} is smaller than {} ".format( + word, len(stats[word]), pron, soft_count, -loss / log_delta, + -args.alpha[1] * len(stats[word]) + (objf_change + args.beta[1])), + file=sys.stderr) + if loss_abs > thr: + print(" or beta_{g2p} is smaller than {}.".format(( + loss_abs / thr - 1) * len(stats[word])), file=sys.stderr) + deleted_prons.append(deleted_candidate[1]) + for i in range(len(prons[word])): + if i not in deleted_prons: + learned_lexicon[word].add(prons[word][i]) + + return learned_lexicon + +def WriteLearnedLexicon(learned_lexicon, file_handle): + for word, prons in learned_lexicon.iteritems(): + for pron in prons: + print('{0} {1}'.format(word, pron), file=file_handle) + file_handle.close() + +def Main(): + args = GetArgs() + + # Read in three lexicon sources, word counts, and pron stats. + counts = ReadWordCounts(args.word_counts_file_handle) + ref_lexicon = ReadLexicon(args, args.ref_lexicon_handle, counts) + g2p_lexicon = ReadLexicon(args, args.g2p_lexicon_handle, counts) + pd_lexicon = ReadLexicon(args, args.pd_lexicon_handle, counts) + stats, stats_summed = ReadArcStats(args.arc_stats_file_handle) + pd_lexicon = FilterPhoneticDecodingLexicon(args, pd_lexicon) + + # Select prons to construct the learned lexicon. + learned_lexicon = SelectPronsGreedy(args, stats, counts, ref_lexicon, g2p_lexicon, pd_lexicon) + + # Write the learned prons for words out of the ref. vocab into learned_lexicon_oov. + WriteLearnedLexicon(learned_lexicon, args.learned_lexicon_handle) + +if __name__ == "__main__": + Main() diff --git a/egs/wsj/s5/steps/libs/common.py b/egs/wsj/s5/steps/libs/common.py index 503721c23d1..6bf0ea4932c 100644 --- a/egs/wsj/s5/steps/libs/common.py +++ b/egs/wsj/s5/steps/libs/common.py @@ -10,6 +10,7 @@ """ from __future__ import print_function +from __future__ import division import argparse import logging import math @@ -316,7 +317,7 @@ def read_kaldi_matrix(matrix_file): 'matrix_file' and stores it as a list of rows, where each row is a list. 
""" try: - lines = map(lambda x: x.split(), open(matrix_file).readlines()) + lines = [x.split() for x in open(matrix_file).readlines()] first_field = lines[0][0] last_field = lines[-1][-1] lines[0] = lines[0][1:] @@ -326,7 +327,7 @@ def read_kaldi_matrix(matrix_file): "Kaldi matrix file has incorrect format, " "only text format matrix files can be read by this script") for i in range(len(lines)): - lines[i] = map(lambda x: int(float(x)), lines[i]) + lines[i] = [int(float(x)) for x in lines[i]] return lines except IOError: raise Exception("Error while reading the kaldi matrix file " @@ -348,7 +349,7 @@ def write_kaldi_matrix(output_file, matrix): if num_cols != len(matrix[row_index]): raise Exception("All the rows of a matrix are expected to " "have the same length") - f.write(" ".join(map(lambda x: str(x), matrix[row_index]))) + f.write(" ".join([str(x) for x in matrix[row_index]])) if row_index != num_rows - 1: f.write("\n") f.write(" ]") @@ -508,7 +509,7 @@ def compute_idct_matrix(K, N, cepstral_lifter=0): lifter_coeffs = compute_lifter_coeffs(cepstral_lifter, K) for k in range(0, K): for n in range(0, N): - matrix[n][k] = matrix[n][k] / lifter_coeffs[k] + matrix[n][k] = float(matrix[n][k]) / lifter_coeffs[k] return matrix diff --git a/egs/wsj/s5/steps/libs/nnet3/report/log_parse.py b/egs/wsj/s5/steps/libs/nnet3/report/log_parse.py index 1afc26ff163..97da5e04962 100755 --- a/egs/wsj/s5/steps/libs/nnet3/report/log_parse.py +++ b/egs/wsj/s5/steps/libs/nnet3/report/log_parse.py @@ -322,7 +322,7 @@ def parse_progress_logs_for_param_diff(exp_dir, pattern): groups = mat_obj.groups() iteration = groups[0] differences = parse_difference_string(groups[1]) - component_names = component_names.union(differences.keys()) + component_names = component_names.union(list(differences.keys())) progress_per_iter[int(iteration)] = differences component_names = list(component_names) @@ -435,14 +435,14 @@ def parse_prob_logs(exp_dir, key='accuracy', output="output"): raise KaldiLogParseException("Could not find any lines with {k} in " " {l}".format(k=key, l=valid_prob_files)) - iters = list(set(valid_objf.keys()).intersection(train_objf.keys())) + iters = list(set(valid_objf.keys()).intersection(list(train_objf.keys()))) if not iters: raise KaldiLogParseException("Could not any common iterations with" " key {k} in both {tl} and {vl}".format( k=key, tl=train_prob_files, vl=valid_prob_files)) iters.sort() - return list(map(lambda x: (int(x), float(train_objf[x]), - float(valid_objf[x])), iters)) + return list([(int(x), float(train_objf[x]), + float(valid_objf[x])) for x in iters]) def parse_rnnlm_prob_logs(exp_dir, key='objf'): train_prob_files = "%s/log/train.*.*.log" % (exp_dir) @@ -498,14 +498,14 @@ def parse_rnnlm_prob_logs(exp_dir, key='objf'): raise KaldiLogParseException("Could not find any lines with {k} in " " {l}".format(k=key, l=valid_prob_files)) - iters = list(set(valid_objf.keys()).intersection(train_objf.keys())) + iters = list(set(valid_objf.keys()).intersection(list(train_objf.keys()))) if not iters: raise KaldiLogParseException("Could not any common iterations with" " key {k} in both {tl} and {vl}".format( k=key, tl=train_prob_files, vl=valid_prob_files)) iters.sort() - return map(lambda x: (int(x), float(train_objf[x]), - float(valid_objf[x])), iters) + return [(int(x), float(train_objf[x]), + float(valid_objf[x])) for x in iters] diff --git a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py index 
6afb43824fd..c932a9c54f7 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py @@ -7,6 +7,8 @@ """ This is a module with methods which will be used by scripts for training of deep neural network acoustic model with chain objective. """ +from __future__ import division +from __future__ import print_function import logging import math @@ -413,8 +415,7 @@ def compute_preconditioning_matrix(dir, egs_dir, num_lda_jobs, run_opts, rand_prune=rand_prune)) # the above command would have generated dir/{1..num_lda_jobs}.lda_stats - lda_stat_files = list(map(lambda x: '{0}/{1}.lda_stats'.format(dir, x), - range(1, num_lda_jobs + 1))) + lda_stat_files = ['{0}/{1}.lda_stats'.format(dir, x) for x in range(1, num_lda_jobs + 1)] common_lib.execute_command( """{command} {dir}/log/sum_transform_stats.log \ diff --git a/egs/wsj/s5/steps/libs/nnet3/train/common.py b/egs/wsj/s5/steps/libs/nnet3/train/common.py index a2892a090f3..1a038cc23f2 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/common.py @@ -7,6 +7,7 @@ """This module contains classes and methods common to training of nnet3 neural networks. """ +from __future__ import division import argparse import glob @@ -528,13 +529,13 @@ def smooth_presoftmax_prior_scale_vector(pdf_counts, presoftmax_prior_scale_power=-0.25, smooth=0.01): total = sum(pdf_counts) - average_count = total/len(pdf_counts) + average_count = float(total) / len(pdf_counts) scales = [] for i in range(len(pdf_counts)): scales.append(math.pow(pdf_counts[i] + smooth * average_count, presoftmax_prior_scale_power)) num_pdfs = len(pdf_counts) - scaled_counts = list(map(lambda x: x * float(num_pdfs) / sum(scales), scales)) + scaled_counts = [x * float(num_pdfs) / sum(scales) for x in scales] return scaled_counts @@ -564,7 +565,7 @@ def get_model_combine_iters(num_iters, num_epochs, in the final model-averaging phase. (note: it's a weighted average where the weights are worked out from a subset of training data.)""" - approx_iters_per_epoch_final = num_archives/num_jobs_final + approx_iters_per_epoch_final = float(num_archives) / num_jobs_final # Note: it used to be that we would combine over an entire epoch, # but in practice we very rarely would use any weights from towards # the end of that range, so we are changing it to use not @@ -581,8 +582,8 @@ def get_model_combine_iters(num_iters, num_epochs, # But if this value is > max_models_combine, then the models # are subsampled to get these many models to combine. 
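A small sketch, with invented values, of the exponentially decaying schedule computed by the get_learning_rate hunk just below: the effective rate decays geometrically from the initial to the final value over num_archives_to_process archives (the actual script then scales this by the current number of jobs).

    import math

    # Invented values.
    initial_effective_lrate = 0.001
    final_effective_lrate = 0.0001
    num_archives_to_process = 1000

    def effective_lrate(num_archives_processed):
        return initial_effective_lrate * math.exp(
            num_archives_processed
            * math.log(float(final_effective_lrate) / initial_effective_lrate)
            / num_archives_to_process)

    print(effective_lrate(0))     # 0.001
    print(effective_lrate(500))   # ~0.000316 (the geometric midpoint)
    print(effective_lrate(1000))  # ~0.0001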
- num_iters_combine_initial = min(approx_iters_per_epoch_final/2 + 1, - num_iters/2) + num_iters_combine_initial = min(int(approx_iters_per_epoch_final/2) + 1, + int(num_iters/2)) if num_iters_combine_initial > max_models_combine: subsample_model_factor = int( @@ -610,8 +611,7 @@ def get_learning_rate(iter, num_jobs, num_iters, num_archives_processed, effective_learning_rate = ( initial_effective_lrate * math.exp(num_archives_processed - * math.log(final_effective_lrate - / initial_effective_lrate) + * math.log(float(final_effective_lrate) / initial_effective_lrate) / num_archives_to_process)) return num_jobs * effective_learning_rate diff --git a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py index cc5c9693a12..f2722350e41 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py @@ -348,8 +348,7 @@ def compute_preconditioning_matrix(dir, egs_dir, num_lda_jobs, run_opts, rand_prune=rand_prune)) # the above command would have generated dir/{1..num_lda_jobs}.lda_stats - lda_stat_files = list(map(lambda x: '{0}/{1}.lda_stats'.format(dir, x), - range(1, num_lda_jobs + 1))) + lda_stat_files = ['{0}/{1}.lda_stats'.format(dir, x) for x in range(1, num_lda_jobs + 1)] common_lib.execute_command( """{command} {dir}/log/sum_transform_stats.log \ diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/attention.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/attention.py index e870c1a60cf..db4cb392f10 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/attention.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/attention.py @@ -6,6 +6,7 @@ """ from __future__ import print_function +from __future__ import division import math import re import sys diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py index 9a856bc6fe1..7846c983b19 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py @@ -9,6 +9,7 @@ """ from __future__ import print_function +from __future__ import division import math import re import sys diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/convolution.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/convolution.py index be8bcaefedf..5597ff0e216 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/convolution.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/convolution.py @@ -7,6 +7,7 @@ """ This module has the implementation of convolutional layers. 
""" from __future__ import print_function +from __future__ import division import math import re import sys @@ -880,7 +881,7 @@ def _generate_normal_resblock_config(self): num_filters_out = self.config['num-filters'] if height_out != height_in: - if height_out < height_in / 2 - 1 or height_out > height_in / 2 + 1: + if height_out < height_in / 2 - 1 or height_out > height_in / 2 + 1: raise RuntimeError("Expected height-out to be about half height-in, or the same: " "height-in={0} height-out={1}".format(height_in, height_out)) if not time_period_out % 2 == 0: @@ -1030,7 +1031,7 @@ def _generate_bottleneck_resblock_config(self): num_filters_out = self.config['num-filters'] if height_out != height_in: - if height_out < height_in / 2 - 1 or height_out > height_in / 2 + 1: + if height_out < height_in / 2 - 1 or height_out > height_in / 2 + 1: raise RuntimeError("Expected height-out to be about half height-in, or the same: " "height-in={0} height-out={1}".format(height_in, height_out)) height_subsample = 2 diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py index ede0201f572..5ac2ed59003 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py @@ -81,6 +81,7 @@ 'linear-component': xlayers.XconfigLinearComponent, 'affine-component': xlayers.XconfigAffineComponent, 'scale-component': xlayers.XconfigPerElementScaleComponent, + 'dim-range-component': xlayers.XconfigDimRangeComponent, 'offset-component': xlayers.XconfigPerElementOffsetComponent, 'combine-feature-maps-layer': xlayers.XconfigCombineFeatureMapsLayer } diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/trivial_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/trivial_layers.py index f91258bab04..2728ad40639 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/trivial_layers.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/trivial_layers.py @@ -580,3 +580,67 @@ def _generate_config(self): self.name, input_desc)) configs.append(line) return configs + + +class XconfigDimRangeComponent(XconfigLayerBase): + """This class is for parsing lines like + 'dim-range-component name=feature1 input=Append(-3,0,3) dim=40 dim-offset=0' + which will produce just a single component, of part of the input. + Parameters of the class, and their defaults: + input='[-1]' [Descriptor giving the input of the layer.] + dim=-1 [Dimension of the output.] + dim-offset=0 [Dimension offset of the input.] 
+ """ + def __init__(self, first_token, key_to_value, prev_names=None): + XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) + + def set_default_configs(self): + self.config = {'input': '[-1]', + 'dim': -1, + 'dim-offset': 0 } + + def check_configs(self): + input_dim = self.descriptors['input']['dim'] + if self.config['dim'] <= 0: + raise RuntimeError("'dim' must be specified and > 0.") + elif self.config['dim'] > input_dim: + raise RuntimeError("'dim' must be specified and lower than the input dim.") + if self.config['dim-offset'] < 0 : + raise RuntimeError("'dim-offset' must be specified and >= 0.") + elif self.config['dim-offset'] + self.config['dim'] > input_dim: + raise RuntimeError("'dim-offset' plus output dim must be lower than the input dim.") + + def output_name(self, auxiliary_output=None): + assert auxiliary_output is None + return self.name + + def output_dim(self, auxiliary_output=None): + assert auxiliary_output is None + output_dim = self.config['dim'] + if output_dim <= 0: + self.config['dim'] = self.descriptors['input']['dim'] + return output_dim + + def get_full_config(self): + ans = [] + config_lines = self._generate_config() + + for line in config_lines: + for config_name in ['ref', 'final']: + # we do not support user specified matrices in this layer + # so 'ref' and 'final' configs are the same. + ans.append((config_name, line)) + return ans + + def _generate_config(self): + # by 'descriptor_final_string' we mean a string that can appear in + # config-files, i.e. it contains the 'final' names of nodes. + input_node = self.descriptors['input']['final-string'] + output_dim = self.config['dim'] + dim_offset = self.config['dim-offset'] + + configs = [] + line = ('dim-range-node name={0} input-node={1} dim={2} dim-offset={3}'.format( + self.name, input_node, output_dim, dim_offset)) + configs.append(line) + return configs diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/utils.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/utils.py index 08de18167cd..0188248d694 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/utils.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/utils.py @@ -184,7 +184,7 @@ def convert_value_to_type(key, dest_type, string_value): # Also, in any place a raw input/layer/output name can appear, we accept things # like [-1] meaning the previous input/layer/output's name, or [-2] meaning the # last-but-one input/layer/output, and so on. 
-class Descriptor: +class Descriptor(object): def __init__(self, descriptor_string = None, prev_names = None): @@ -595,7 +595,7 @@ def parse_config_line(orig_config_line): rest_of_line = ' '.join(fields) # rest of the line can be of the form 'a=1 b=" x=1 y=2 " c=Append( i1, i2)' - positions = list(map(lambda x: x.start(), re.finditer('"', rest_of_line))) + positions = [x.start() for x in re.finditer('"', rest_of_line)] if not len(positions) % 2 == 0: raise RuntimeError("Double-quotes should occur in pairs") diff --git a/egs/wsj/s5/steps/nnet2/make_multisplice_configs.py b/egs/wsj/s5/steps/nnet2/make_multisplice_configs.py index 6e7bff3fa17..b5338b516e8 100755 --- a/egs/wsj/s5/steps/nnet2/make_multisplice_configs.py +++ b/egs/wsj/s5/steps/nnet2/make_multisplice_configs.py @@ -4,14 +4,16 @@ # Creates the nnet.config and hidde_*.config scripts used in train_pnorm_multisplice.sh # Parses the splice string to generate relevant variables for get_egs.sh, get_lda.sh and nnet/hidden.config files +from __future__ import division +from __future__ import print_function import re, argparse, sys, math, warnings # returns the set of frame indices required to perform the convolution # between sequences with frame indices in x and y def get_convolution_index_set(x, y): z = [] - for i in xrange(len(x)): - for j in xrange(len(y)): + for i in range(len(x)): + for j in range(len(y)): z.append(x[i]+y[j]) z = list(set(z)) z.sort() @@ -19,7 +21,7 @@ def get_convolution_index_set(x, y): def parse_splice_string(splice_string): layerwise_splice_indexes = splice_string.split('layer')[1:] - print splice_string.split('layer') + print(splice_string.split('layer')) contexts={} first_right_context = 0 # default value first_left_context = 0 # default value @@ -29,14 +31,14 @@ def parse_splice_string(splice_string): try: for cur_splice_indexes in layerwise_splice_indexes: layer_index, frame_indexes = cur_splice_indexes.split("/") - frame_indexes = map(lambda x: int(x), frame_indexes.split(':')) + frame_indexes = [int(x) for x in frame_indexes.split(':')] layer_index = int(layer_index) assert(layer_index >= 0) if layer_index == 0: first_left_context = min(frame_indexes) first_right_context = max(frame_indexes) try: - assert(frame_indexes == range(first_left_context, first_right_context+1)) + assert(frame_indexes == list(range(first_left_context, first_right_context+1))) except AssertionError: raise Exception('Currently the first splice component just accepts contiguous context.') try: @@ -46,11 +48,11 @@ def parse_splice_string(splice_string): left context provided is %d and right context provided is %d.""" % (first_left_context, first_right_context)) # convolve the current splice indices with the splice indices until last layer nnet_frame_indexes = get_convolution_index_set(frame_indexes, nnet_frame_indexes) - cur_context = ":".join(map(lambda x: str(x), frame_indexes)) + cur_context = ":".join([str(x) for x in frame_indexes]) contexts[layer_index] = cur_context except ValueError: raise Exception('Unknown format in splice_indexes variable: {0}'.format(params.splice_indexes)) - print nnet_frame_indexes + print(nnet_frame_indexes) max_left_context = min(nnet_frame_indexes) max_right_context = max(nnet_frame_indexes) return [contexts, ' nnet_left_context={0};\n nnet_right_context={1}\n first_left_context={2};\n first_right_context={3}\n'.format(abs(max_left_context), abs(max_right_context), abs(first_left_context), abs(first_right_context) )] @@ -87,7 +89,7 @@ def create_config_files(output_dir, params): except KeyError: raise 
Exception('A splice layer is expected to be the first layer. Provide a context for the first layer.') - for i in xrange(1, params.num_hidden_layers): #just run till num_hidden_layers-1 since we do not add splice before the final affine transform + for i in range(1, params.num_hidden_layers): #just run till num_hidden_layers-1 since we do not add splice before the final affine transform lines=[] context_len = 1 if i in contexts: @@ -109,7 +111,7 @@ def create_config_files(output_dir, params): if __name__ == "__main__": - print " ".join(sys.argv) + print(" ".join(sys.argv)) parser = argparse.ArgumentParser() parser.add_argument('--splice-indexes', type=str, help='string specifying the indexes for the splice layers throughout the network') parser.add_argument('--total-input-dim', type=int, help='dimension of the input to the network') @@ -127,7 +129,7 @@ def create_config_files(output_dir, params): parser.add_argument("output_dir", type=str, help="output directory to store the files") params = parser.parse_args() - print params + print(params) if params.mode == "contexts": [context, context_variables] = parse_splice_string(params.splice_indexes) var_file = open("{0}/vars".format(params.output_dir), "w") diff --git a/egs/wsj/s5/steps/nnet3/chain/e2e/text_to_phones.py b/egs/wsj/s5/steps/nnet3/chain/e2e/text_to_phones.py index 0ff05e3c48e..2c51cb57750 100755 --- a/egs/wsj/s5/steps/nnet3/chain/e2e/text_to_phones.py +++ b/egs/wsj/s5/steps/nnet3/chain/e2e/text_to_phones.py @@ -8,6 +8,7 @@ to phone transcriptions using the provided lexicon, and writes them to standard output. """ +from __future__ import print_function import argparse from os.path import join diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index a832f57cd8f..40b65afe273 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -6,6 +6,8 @@ """ This script is based on steps/nnet3/chain/train.sh """ +from __future__ import division +from __future__ import print_function import argparse import logging diff --git a/egs/wsj/s5/steps/nnet3/components.py b/egs/wsj/s5/steps/nnet3/components.py index 34443d586ca..8e879579776 100644 --- a/egs/wsj/s5/steps/nnet3/components.py +++ b/egs/wsj/s5/steps/nnet3/components.py @@ -84,7 +84,7 @@ def AddBlockAffineLayer(config_lines, name, input, output_dim, num_blocks): def AddPermuteLayer(config_lines, name, input, column_map): components = config_lines['components'] component_nodes = config_lines['component-nodes'] - permute_indexes = ",".join(map(lambda x: str(x), column_map)) + permute_indexes = ",".join([str(x) for x in column_map]) components.append('component name={0}_permute type=PermuteComponent column-map={1}'.format(name, permute_indexes)) component_nodes.append('component-node name={0}_permute component={0}_permute input={1}'.format(name, input['descriptor'])) diff --git a/egs/wsj/s5/steps/nnet3/convert_nnet2_to_nnet3.py b/egs/wsj/s5/steps/nnet3/convert_nnet2_to_nnet3.py index f0a4341d12b..66ff633fbfc 100755 --- a/egs/wsj/s5/steps/nnet3/convert_nnet2_to_nnet3.py +++ b/egs/wsj/s5/steps/nnet3/convert_nnet2_to_nnet3.py @@ -6,6 +6,7 @@ # It requires knowledge of valid components which # can be modified in the configuration section below. 
+from __future__ import print_function import argparse, os, tempfile, logging, sys, shutil, fileinput, re from collections import defaultdict, namedtuple import numpy as np @@ -51,7 +52,7 @@ SPLICE_COMPONENTS = [c for c in NODE_NAMES if "Splice" in c] AFFINE_COMPONENTS = [c for c in NODE_NAMES if "Affine" in c] -KNOWN_COMPONENTS = NODE_NAMES.keys() +KNOWN_COMPONENTS = list(NODE_NAMES.keys()) # End configuration section logger = logging.getLogger(__name__) diff --git a/egs/wsj/s5/steps/nnet3/dot/descriptor_parser.py b/egs/wsj/s5/steps/nnet3/dot/descriptor_parser.py index a46d144d0b6..ee6fa11b5c9 100644 --- a/egs/wsj/s5/steps/nnet3/dot/descriptor_parser.py +++ b/egs/wsj/s5/steps/nnet3/dot/descriptor_parser.py @@ -33,7 +33,7 @@ def ParseSubsegmentsAndArguments(segment_endpoints, sub_segments, arguments, inp else: arguments.append(sub_segment_name) else: - arguments = map(lambda x: re.sub(',','', x.strip()), input_string[segment_endpoints[0]:segment_endpoints[1]+1].split()) + arguments = [re.sub(',','', x.strip()) for x in input_string[segment_endpoints[0]:segment_endpoints[1]+1].split()] sub_segments = [] return sub_segments, arguments diff --git a/egs/wsj/s5/steps/nnet3/dot/nnet3_to_dot.py b/egs/wsj/s5/steps/nnet3/dot/nnet3_to_dot.py index f8cd357fa3b..4230b32aa7c 100755 --- a/egs/wsj/s5/steps/nnet3/dot/nnet3_to_dot.py +++ b/egs/wsj/s5/steps/nnet3/dot/nnet3_to_dot.py @@ -189,7 +189,7 @@ def ProcessSumDescriptor(segment, parent_node_name, affix, edge_attributes = Non sub_segment = segment['sub_segments'][i] part_name = "{0}{1}{2}".format(desc_name, sub_segment['name'], i) names.append("<{0}> part {1}".format(GetDotNodeName(part_name)['node'], i)) - dot_graph += DescriptorSegmentToDot(sub_segment, "{0}:{1}".format(desc_name, part_name), desc_name+"_"+str(i)) + dot_graph += DescriptorSegmentToDot(sub_segment, "{0}:{1}".format(desc_name, part_name), "{0}_{1}".format(desc_name, i)) # link the sum node parts to corresponding segments part_index = len(segment['sub_segments']) @@ -321,7 +321,7 @@ def Nnet3ComponentToDot(component_config, component_attributes = None): label = '' if component_attributes is None: component_attributes = component_config.keys() - attributes_to_print = set(component_attributes).intersection(component_config.keys()) + attributes_to_print = set(component_attributes).intersection(list(component_config.keys())) # process the known fields for key in attributes_to_print: if key in component_config: diff --git a/egs/wsj/s5/steps/nnet3/get_successful_models.py b/egs/wsj/s5/steps/nnet3/get_successful_models.py index 3661d91b8d5..e6dcf376a51 100755 --- a/egs/wsj/s5/steps/nnet3/get_successful_models.py +++ b/egs/wsj/s5/steps/nnet3/get_successful_models.py @@ -56,7 +56,7 @@ if (loss[max_index] - loss[i]) <= args.difference_threshold: accepted_models.append(i+1) - model_list = " ".join(map(lambda x: str(x), accepted_models)) + model_list = " ".join([str(x) for x in accepted_models]) print(model_list) if len(accepted_models) != args.num_models: diff --git a/egs/wsj/s5/steps/nnet3/lstm/make_configs.py b/egs/wsj/s5/steps/nnet3/lstm/make_configs.py index b80a8d4045b..8a533465f07 100755 --- a/egs/wsj/s5/steps/nnet3/lstm/make_configs.py +++ b/egs/wsj/s5/steps/nnet3/lstm/make_configs.py @@ -181,7 +181,7 @@ def ParseSpliceString(splice_indexes, label_delay=None): splice_array = [] try: for i in range(len(split1)): - indexes = map(lambda x: int(x), split1[i].strip().split(",")) + indexes = [int(x) for x in split1[i].strip().split(",")] print(indexes) if len(indexes) < 1: raise 
ValueError("invalid --splice-indexes argument, too-short element: " @@ -214,12 +214,12 @@ def ParseLstmDelayString(lstm_delay): lstm_delay_array = [] try: for i in range(len(split1)): - indexes = map(lambda x: int(x), split1[i].strip().lstrip('[').rstrip(']').strip().split(",")) + indexes = [int(x) for x in split1[i].strip().lstrip('[').rstrip(']').strip().split(",")] if len(indexes) < 1: raise ValueError("invalid --lstm-delay argument, too-short element: " + lstm_delay) elif len(indexes) == 2 and indexes[0] * indexes[1] >= 0: - raise ValueError('Warning: ' + str(indexes) + ' is not a standard BLSTM mode. There should be a negative delay for the forward, and a postive delay for the backward.') + raise ValueError('Warning: {} is not a standard BLSTM mode. There should be a negative delay for the forward, and a postive delay for the backward.'.format(indexes)) if len(indexes) == 2 and indexes[0] > 0: # always a negative delay followed by a postive delay indexes[0], indexes[1] = indexes[1], indexes[0] lstm_delay_array.append(indexes) @@ -335,9 +335,9 @@ def ProcessSpliceIndexes(config_dir, splice_indexes, label_delay, num_lstm_layer # write the files used by other scripts like steps/nnet3/get_egs.sh f = open(config_dir + "/vars", "w") - print('model_left_context=' + str(left_context), file=f) - print('model_right_context=' + str(right_context), file=f) - print('num_hidden_layers=' + str(num_hidden_layers), file=f) + print('model_left_context={}'.format(left_context), file=f) + print('model_right_context={}'.format(right_context), file=f) + print('num_hidden_layers={}'.format(num_hidden_layers), file=f) # print('initial_right_context=' + str(splice_array[0][-1]), file=f) f.close() diff --git a/egs/wsj/s5/steps/nnet3/make_tdnn_configs.py b/egs/wsj/s5/steps/nnet3/make_tdnn_configs.py index 162fda16d16..d121be6d899 100644 --- a/egs/wsj/s5/steps/nnet3/make_tdnn_configs.py +++ b/egs/wsj/s5/steps/nnet3/make_tdnn_configs.py @@ -98,21 +98,21 @@ input_dim = len(splice_array[0]) * args.feat_dim + args.ivector_dim f = open(args.config_dir + "/vars", "w") -print('left_context=' + str(left_context), file=f) -print('right_context=' + str(right_context), file=f) +print('left_context={}'.format(left_context), file=f) +print('right_context={}'.format(right_context), file=f) # the initial l/r contexts are actually not needed. # print('initial_left_context=' + str(splice_array[0][0]), file=f) # print('initial_right_context=' + str(splice_array[0][-1]), file=f) -print('num_hidden_layers=' + str(num_hidden_layers), file=f) +print('num_hidden_layers={}'.format(num_hidden_layers), file=f) f.close() f = open(args.config_dir + "/init.config", "w") print('# Config file for initializing neural network prior to', file=f) print('# preconditioning matrix computation', file=f) -print('input-node name=input dim=' + str(args.feat_dim), file=f) +print('input-node name=input dim={}'.format(args.feat_dim), file=f) list=[ ('Offset(input, {0})'.format(n) if n != 0 else 'input' ) for n in splice_array[0] ] if args.ivector_dim > 0: - print('input-node name=ivector dim=' + str(args.ivector_dim), file=f) + print('input-node name=ivector dim={}'.format(args.ivector_dim), file=f) list.append('ReplaceIndex(ivector, t, 0)') # example of next line: # output-node name=output input="Append(Offset(input, -3), Offset(input, -2), Offset(input, -1), ... 
, Offset(input, 3), ReplaceIndex(ivector, t, 0))" diff --git a/egs/wsj/s5/steps/nnet3/multilingual/allocate_multilingual_examples.py b/egs/wsj/s5/steps/nnet3/multilingual/allocate_multilingual_examples.py index 54c65eb5403..a407869854d 100755 --- a/egs/wsj/s5/steps/nnet3/multilingual/allocate_multilingual_examples.py +++ b/egs/wsj/s5/steps/nnet3/multilingual/allocate_multilingual_examples.py @@ -40,7 +40,6 @@ """ -from __future__ import print_function import os, argparse, sys, random import logging import traceback @@ -163,7 +162,7 @@ def process_multilingual_egs(args): "not include any examples from this lang.") logger.info("The proportion of egs from lang {} is {:.2f}. The number of blocks " "per archive for this lang is approximately {:.2f}. " - "{}".format(lang, lang_to_num_examples[lang] / tot_num_egs, + "{}".format(lang, float(lang_to_num_examples[lang]) / tot_num_egs, blocks_per_archive_this_lang, warning)) @@ -173,11 +172,11 @@ def process_multilingual_egs(args): lang_to_num_remaining_egs = [n for n in lang_to_num_examples] for archive_index in range(num_archives + 1): # +1 is because we write to the last archive in two rounds num_remaining_archives = num_archives - archive_index - num_remaining_blocks = num_remaining_egs / args.block_size + num_remaining_blocks = float(num_remaining_egs) / args.block_size last_round = (archive_index == num_archives) if not last_round: - num_blocks_this_archive = int(round(num_remaining_blocks / num_remaining_archives)) + num_blocks_this_archive = int(round(float(num_remaining_blocks) / num_remaining_archives)) logger.info("Generating archive {} containing {} blocks...".format(archive_index, num_blocks_this_archive)) else: # This is the second round for the last archive. Flush all the remaining egs... archive_index = num_archives - 1 @@ -194,7 +193,7 @@ def process_multilingual_egs(args): for block_index in range(num_blocks_this_archive): # Find the lang with the highest proportion of remaining examples - remaining_proportions = [remain / tot for remain, tot in zip(lang_to_num_remaining_egs, lang_to_num_examples)] + remaining_proportions = [float(remain) / tot for remain, tot in zip(lang_to_num_remaining_egs, lang_to_num_examples)] lang_index, max_proportion = max(enumerate(remaining_proportions), key=lambda a: a[1]) # Read 'block_size' examples from the selected lang and write them to the current output scp file: diff --git a/egs/wsj/s5/steps/nnet3/report/generate_plots.py b/egs/wsj/s5/steps/nnet3/report/generate_plots.py index 93cbc940c33..572e2cf08b7 100755 --- a/egs/wsj/s5/steps/nnet3/report/generate_plots.py +++ b/egs/wsj/s5/steps/nnet3/report/generate_plots.py @@ -4,6 +4,7 @@ # 2016 Vimal Manohar # Apache 2.0. 
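A tiny sketch, with invented counts, of the block-allocation rule in allocate_multilingual_examples.py above: each block of egs is drawn from the language that still has the largest fraction of its examples unwritten (the ratio the patch wraps in float() so it remains a true division under Python 2).

    # Invented per-language totals and remaining example counts.
    lang_to_num_examples = [1000, 400, 200]
    lang_to_num_remaining_egs = [700, 390, 50]

    remaining_proportions = [float(remain) / tot for remain, tot in
                             zip(lang_to_num_remaining_egs, lang_to_num_examples)]
    lang_index, max_proportion = max(enumerate(remaining_proportions),
                                     key=lambda a: a[1])
    print(lang_index, max_proportion)  # 1 0.975 -> the next block comes from language 1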
+from __future__ import division import argparse import errno import logging @@ -97,7 +98,7 @@ def get_args(): g_plot_colors = ['red', 'blue', 'green', 'black', 'magenta', 'yellow', 'cyan'] -class LatexReport: +class LatexReport(object): """Class for writing a Latex report""" def __init__(self, pdf_file): @@ -422,7 +423,7 @@ def generate_nonlin_stats_plots(exp_dir, output_dir, plot, comparison_dir=None, f.write("\n".join(iter_stat_report)) f.close() if plot: - main_component_names = main_stat_tables.keys() + main_component_names = list(main_stat_tables.keys()) main_component_names.sort() plot_component_names = set(main_component_names) @@ -528,13 +529,13 @@ def generate_clipped_proportion_plots(exp_dir, output_dir, plot, file = open("{dir}/clipped_proportion.log".format(dir=output_dir), "w") iter_stat_report = "" for row in main_cp_stats: - iter_stat_report += "\t".join(map(lambda x: str(x), row)) + "\n" + iter_stat_report += "\t".join([str(x) for x in row]) + "\n" file.write(iter_stat_report) file.close() if plot: main_component_names = ( - stats_per_dir[exp_dir]['cp_per_iter_per_component'].keys()) + list(stats_per_dir[exp_dir]['cp_per_iter_per_component'].keys())) main_component_names.sort() plot_component_names = set(main_component_names) for dir in dirs: @@ -635,22 +636,21 @@ def generate_parameter_diff_plots(exp_dir, output_dir, plot, except KeyError: total_missing_iterations += 1 iter_data.append("NA") - if (total_missing_iterations/len(component_names) > 20 + if (float(total_missing_iterations)/len(component_names) > 20 and not gave_user_warning): logger.warning("There are more than {0} missing " "iterations per component. " "Something might be wrong.".format( - total_missing_iterations - / len(component_names))) + float(total_missing_iterations)/ len(component_names))) gave_user_warning = True f.write(" ".join(iter_data)+"\n") if plot: # get the component names - diff_type = key_file.keys()[0] - main_component_names = stats_per_dir[exp_dir][diff_type][ - 'progress_per_component'].keys() + diff_type = list(key_file.keys())[0] + main_component_names = list(stats_per_dir[exp_dir][diff_type][ + 'progress_per_component'].keys()) main_component_names.sort() plot_component_names = set(main_component_names) diff --git a/egs/wsj/s5/steps/nnet3/report/summarize_compute_debug_timing.py b/egs/wsj/s5/steps/nnet3/report/summarize_compute_debug_timing.py index 442ca4e35cf..5c74eaf128c 100755 --- a/egs/wsj/s5/steps/nnet3/report/summarize_compute_debug_timing.py +++ b/egs/wsj/s5/steps/nnet3/report/summarize_compute_debug_timing.py @@ -7,6 +7,7 @@ # we're using python 3.x style print but want it to work in python 2.x, from __future__ import print_function +from __future__ import division import sys import re import argparse @@ -101,7 +102,7 @@ def Main(): total_time = sum(command_times.values()) sorted_commands = sorted(command_times.items(), key = lambda x: x[1], reverse = True) for item in sorted_commands: - print("{c} : time {t} : fraction {f}".format(c=item[0], t=item[1], f=item[1] / total_time)) + print("{c} : time {t} : fraction {f}".format(c=item[0], t=item[1], f=float(item[1]) / total_time)) if __name__ == "__main__": diff --git a/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py b/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py index 5445b16e165..9e7e92f6768 100755 --- a/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py +++ b/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py @@ -4,6 +4,7 @@ # we're using python 3.x style print but want it to work in python 2.x, from __future__ import print_function +from 
__future__ import division import os import argparse import shlex @@ -519,10 +520,10 @@ def MakeConfigs(config_dir, splice_indexes_string, # write the files used by other scripts like steps/nnet3/get_egs.sh f = open(config_dir + "/vars", "w") - print('model_left_context=' + str(left_context), file=f) - print('model_right_context=' + str(right_context), file=f) - print('num_hidden_layers=' + str(num_hidden_layers), file=f) - print('num_targets=' + str(num_targets), file=f) + print('model_left_context={}'.format(left_context), file=f) + print('model_right_context={}'.format(right_context), file=f) + print('num_hidden_layers={}'.format(num_hidden_layers), file=f) + print('num_targets={}'.format(num_targets), file=f) print('add_lda=' + ('true' if add_lda else 'false'), file=f) print('include_log_softmax=' + ('true' if include_log_softmax else 'false'), file=f) print('objective_type=' + objective_type, file=f) diff --git a/egs/wsj/s5/steps/nnet3/train_dnn.py b/egs/wsj/s5/steps/nnet3/train_dnn.py index 0c881b4dbdf..e72b29297a4 100755 --- a/egs/wsj/s5/steps/nnet3/train_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_dnn.py @@ -9,6 +9,7 @@ """ from __future__ import print_function +from __future__ import division import argparse import logging import os @@ -193,7 +194,7 @@ def train(args, run_opts): shutil.copy('{0}/tree'.format(args.ali_dir), args.dir) with open('{0}/num_jobs'.format(args.dir), 'w') as f: - f.write(str(num_jobs)) + f.write('{}'.format(num_jobs)) if args.input_model is None: config_dir = '{0}/configs'.format(args.dir) @@ -301,8 +302,7 @@ def train(args, run_opts): num_archives_expanded = num_archives * args.frames_per_eg num_archives_to_process = int(args.num_epochs * num_archives_expanded) num_archives_processed = 0 - num_iters = ((num_archives_to_process * 2) - / (args.num_jobs_initial + args.num_jobs_final)) + num_iters = int(num_archives_to_process * 2 / (args.num_jobs_initial + args.num_jobs_final)) # If do_final_combination is True, compute the set of models_to_combine. # Otherwise, models_to_combine will be none. diff --git a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py index fc73cbc7f3f..ffccf443b99 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py @@ -9,6 +9,7 @@ """ from __future__ import print_function +from __future__ import division import argparse import logging import pprint @@ -321,8 +322,7 @@ def train(args, run_opts): num_archives_expanded = num_archives * args.frames_per_eg num_archives_to_process = int(args.num_epochs * num_archives_expanded) num_archives_processed = 0 - num_iters = int((num_archives_to_process * 2) - / (args.num_jobs_initial + args.num_jobs_final)) + num_iters = int((num_archives_to_process * 2) / (args.num_jobs_initial + args.num_jobs_final)) # If do_final_combination is True, compute the set of models_to_combine. # Otherwise, models_to_combine will be none. diff --git a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py index e797c86b323..c704b0725d3 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py @@ -10,6 +10,7 @@ raw neural network instead of an acoustic model. """ from __future__ import print_function +from __future__ import division import argparse import logging import pprint @@ -368,8 +369,7 @@ def train(args, run_opts): # avg_num_jobs=(num_jobs_initial+num_jobs_final)/2. 
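The num_iters line that follows is why several of these training scripts now pair int() with the division: under Python 2 the old expression silently floored, while under Python 3 (or once division is imported from __future__) a bare / returns a float, which for instance can no longer be passed to range(). A quick check with assumed values (the numbers are illustrative only):

    from __future__ import division   # the semantics these scripts now opt into

    num_archives_to_process = 120
    num_jobs_initial, num_jobs_final = 2, 8

    # Same shape as the formula below: twice the archives to process, divided
    # by the average number of parallel jobs over the run.
    num_iters = int(num_archives_to_process * 2
                    / (num_jobs_initial + num_jobs_final))
    assert num_iters == 24
    assert isinstance(num_archives_to_process * 2 / 10, float)   # without int()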
num_archives_to_process = int(args.num_epochs * num_archives) num_archives_processed = 0 - num_iters = ((num_archives_to_process * 2) - / (args.num_jobs_initial + args.num_jobs_final)) + num_iters = int((num_archives_to_process * 2) / (args.num_jobs_initial + args.num_jobs_final)) # If do_final_combination is True, compute the set of models_to_combine. # Otherwise, models_to_combine will be none. @@ -509,7 +509,8 @@ def train(args, run_opts): run_opts=run_opts, chunk_width=args.chunk_width, get_raw_nnet_from_am=False, compute_per_dim_accuracy=args.compute_per_dim_accuracy, - max_objective_evaluations=args.max_objective_evaluations) + max_objective_evaluations=args.max_objective_evaluations, + use_multitask_egs=use_multitask_egs) else: common_lib.force_symlink("{0}.raw".format(num_iters), "{0}/final.raw".format(args.dir)) diff --git a/egs/wsj/s5/steps/nnet3/train_rnn.py b/egs/wsj/s5/steps/nnet3/train_rnn.py index 25e7dced19b..ab2aa0c4d8d 100755 --- a/egs/wsj/s5/steps/nnet3/train_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_rnn.py @@ -8,6 +8,7 @@ """ from __future__ import print_function +from __future__ import division import argparse import logging import os @@ -248,7 +249,7 @@ def train(args, run_opts): shutil.copy('{0}/tree'.format(args.ali_dir), args.dir) with open('{0}/num_jobs'.format(args.dir), 'w') as f: - f.write(str(num_jobs)) + f.write('{}'.format(num_jobs)) config_dir = '{0}/configs'.format(args.dir) var_file = '{0}/vars'.format(config_dir) @@ -369,8 +370,7 @@ def train(args, run_opts): # avg_num_jobs=(num_jobs_initial+num_jobs_final)/2. num_archives_to_process = int(args.num_epochs * num_archives) num_archives_processed = 0 - num_iters = ((num_archives_to_process * 2) - / (args.num_jobs_initial + args.num_jobs_final)) + num_iters = int((num_archives_to_process * 2) / (args.num_jobs_initial + args.num_jobs_final)) # If do_final_combination is True, compute the set of models_to_combine. # Otherwise, models_to_combine will be none. diff --git a/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py b/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py index 3b8dc82fe48..f025eb5b343 100755 --- a/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py +++ b/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py @@ -115,7 +115,7 @@ def write_expanded_xconfig_files(config_dir, all_layers): '# See also ./xconfig.expanded.2\n', file=xconfig_file_out) for layer in all_layers: - print(str(layer), file=xconfig_file_out) + print('{}'.format(layer), file=xconfig_file_out) xconfig_file_out.close() try: @@ -135,7 +135,7 @@ def write_expanded_xconfig_files(config_dir, all_layers): for layer in all_layers: layer.normalize_descriptors() - print(str(layer), file=xconfig_file_out) + print('{}'.format(layer), file=xconfig_file_out) xconfig_file_out.close() diff --git a/egs/wsj/s5/steps/segmentation/internal/find_oov_phone.py b/egs/wsj/s5/steps/segmentation/internal/find_oov_phone.py index 3e9cbbbf178..038640f6271 100644 --- a/egs/wsj/s5/steps/segmentation/internal/find_oov_phone.py +++ b/egs/wsj/s5/steps/segmentation/internal/find_oov_phone.py @@ -8,6 +8,7 @@ /phones/align_lexicon.int. 
It prints the OOV phone to stdout, if it can find a single phone mapping for the OOV word.""" +from __future__ import print_function import sys diff --git a/egs/wsj/s5/steps/segmentation/internal/get_default_targets_for_out_of_segments.py b/egs/wsj/s5/steps/segmentation/internal/get_default_targets_for_out_of_segments.py index e7000b9de00..0361999d904 100755 --- a/egs/wsj/s5/steps/segmentation/internal/get_default_targets_for_out_of_segments.py +++ b/egs/wsj/s5/steps/segmentation/internal/get_default_targets_for_out_of_segments.py @@ -14,6 +14,7 @@ the application and data, this could be [ 0 0 0 ] or [ 0 0 1 ] or something with fractional weights. """ +from __future__ import division import argparse import logging @@ -131,7 +132,7 @@ def run(args): and np.shape(default_targets)[1] == 3) with common_lib.smart_open(args.out_targets_ark, 'w') as f: - for reco, utts in reco2utt.iteritems(): + for reco, utts in reco2utt.items(): reco_mat = np.repeat(default_targets, reco2num_frames[reco], axis=0) utts.sort(key=lambda x: segments[x][1]) # sort on start time diff --git a/egs/wsj/s5/steps/segmentation/internal/merge_segment_targets_to_recording.py b/egs/wsj/s5/steps/segmentation/internal/merge_segment_targets_to_recording.py index 8c53e5e8db9..e48afbeb872 100755 --- a/egs/wsj/s5/steps/segmentation/internal/merge_segment_targets_to_recording.py +++ b/egs/wsj/s5/steps/segmentation/internal/merge_segment_targets_to_recording.py @@ -9,6 +9,7 @@ in any of the segments are assigned the default targets vector, specified by the option --default-targets or [ 0 0 0 ] if unspecified. """ +from __future__ import division import argparse import logging @@ -158,7 +159,7 @@ def run(args): num_reco = 0 with common_lib.smart_open(args.out_targets_ark, 'w') as fh: - for reco, utts in reco2utt.iteritems(): + for reco, utts in reco2utt.items(): # Read a recording and the list of its utterances from the # reco2utt dictionary reco_mat = np.repeat(default_targets, reco2num_frames[reco], diff --git a/egs/wsj/s5/steps/segmentation/internal/merge_targets.py b/egs/wsj/s5/steps/segmentation/internal/merge_targets.py index 8222eddad8f..a14aef151c2 100755 --- a/egs/wsj/s5/steps/segmentation/internal/merge_targets.py +++ b/egs/wsj/s5/steps/segmentation/internal/merge_targets.py @@ -17,6 +17,7 @@ """ from __future__ import print_function +from __future__ import division import argparse import logging import numpy as np diff --git a/egs/wsj/s5/steps/tfrnnlm/lstm.py b/egs/wsj/s5/steps/tfrnnlm/lstm.py index 5f175212c4b..433dc87b4c6 100644 --- a/egs/wsj/s5/steps/tfrnnlm/lstm.py +++ b/egs/wsj/s5/steps/tfrnnlm/lstm.py @@ -203,7 +203,7 @@ def attn_cell(): config.max_grad_norm) optimizer = tf.train.GradientDescentOptimizer(self._lr) self._train_op = optimizer.apply_gradients( - zip(grads, tvars), + list(zip(grads, tvars)), global_step=tf.contrib.framework.get_or_create_global_step()) self._new_lr = tf.placeholder( diff --git a/egs/wsj/s5/steps/tfrnnlm/lstm_fast.py b/egs/wsj/s5/steps/tfrnnlm/lstm_fast.py index 440962a3780..ff6c7263804 100644 --- a/egs/wsj/s5/steps/tfrnnlm/lstm_fast.py +++ b/egs/wsj/s5/steps/tfrnnlm/lstm_fast.py @@ -218,7 +218,7 @@ def attn_cell(): config.max_grad_norm) optimizer = tf.train.GradientDescentOptimizer(self._lr) self._train_op = optimizer.apply_gradients( - zip(grads, tvars), + list(zip(grads, tvars)), global_step=tf.contrib.framework.get_or_create_global_step()) self._new_lr = tf.placeholder( diff --git a/egs/wsj/s5/steps/tfrnnlm/reader.py b/egs/wsj/s5/steps/tfrnnlm/reader.py index fc3d4d0471c..80cdeccbb26 
100644 --- a/egs/wsj/s5/steps/tfrnnlm/reader.py +++ b/egs/wsj/s5/steps/tfrnnlm/reader.py @@ -31,7 +31,7 @@ def _read_words(filename): def _build_vocab(filename): words = _read_words(filename) - word_to_id = dict(zip(words, range(len(words)))) + word_to_id = dict(list(zip(words, list(range(len(words)))))) return word_to_id diff --git a/egs/wsj/s5/steps/tfrnnlm/vanilla_rnnlm.py b/egs/wsj/s5/steps/tfrnnlm/vanilla_rnnlm.py index f3ce1a5c297..ae7a257906e 100644 --- a/egs/wsj/s5/steps/tfrnnlm/vanilla_rnnlm.py +++ b/egs/wsj/s5/steps/tfrnnlm/vanilla_rnnlm.py @@ -201,7 +201,7 @@ def attn_cell(): config.max_grad_norm) optimizer = tf.train.MomentumOptimizer(self._lr, 0.9) self._train_op = optimizer.apply_gradients( - zip(grads, tvars), + list(zip(grads, tvars)), global_step=tf.contrib.framework.get_or_create_global_step()) self._new_lr = tf.placeholder( diff --git a/egs/wsj/s5/utils/ctm/resolve_ctm_overlaps.py b/egs/wsj/s5/utils/ctm/resolve_ctm_overlaps.py index deb8207c5b7..61c9a3014aa 100755 --- a/egs/wsj/s5/utils/ctm/resolve_ctm_overlaps.py +++ b/egs/wsj/s5/utils/ctm/resolve_ctm_overlaps.py @@ -17,6 +17,7 @@ """ from __future__ import print_function +from __future__ import division import argparse import collections import logging @@ -231,7 +232,7 @@ def resolve_overlaps(ctms, segments): try: index = next( (i for i, line in enumerate(ctm_for_next_utt) - if line[2] + line[3] / 2.0 > overlap / 2.0)) + if line[2] + line[3] / 2.0 > overlap / 2.0)) except StopIteration: # This can happen if there is no word hypothesized after # half the overlap region. @@ -277,7 +278,7 @@ def run(args): segments, reco2utt = read_segments(args.segments) ctms = read_ctm(args.ctm_in, segments) - for reco, utts in reco2utt.iteritems(): + for reco, utts in reco2utt.items(): ctms_for_reco = [] for utt in sorted(utts, key=lambda x: segments[x][1]): if (reco, utt) in ctms: diff --git a/egs/wsj/s5/utils/data/get_uniform_subsegments.py b/egs/wsj/s5/utils/data/get_uniform_subsegments.py index c61b96e0dbb..cc3015564a5 100755 --- a/egs/wsj/s5/utils/data/get_uniform_subsegments.py +++ b/egs/wsj/s5/utils/data/get_uniform_subsegments.py @@ -4,6 +4,7 @@ # 2017 Matthew Maciejewski # Apache 2.0. +from __future__ import print_function import argparse import logging import sys diff --git a/egs/wsj/s5/utils/data/internal/choose_utts_to_combine.py b/egs/wsj/s5/utils/data/internal/choose_utts_to_combine.py index 740b9aa612b..875c238abd5 100755 --- a/egs/wsj/s5/utils/data/internal/choose_utts_to_combine.py +++ b/egs/wsj/s5/utils/data/internal/choose_utts_to_combine.py @@ -89,7 +89,7 @@ def CombineList(min_duration, durations): # for each utterance-index i, group_start[i] gives us the # start-index of the group of utterances of which it's currently # a member. - group_start = range(num_utts) + group_start = list(range(num_utts)) # if utterance-index i currently corresponds to the start of a group # of utterances, then group_durations[i] is the total duration of # that utterance-group, otherwise undefined. @@ -327,7 +327,7 @@ def GetUtteranceGroups(min_duration, spk2utt, utt2dur): utt_groups = GetUtteranceGroups(args.min_duration, spk2utt, utt2dur) # set utt_group names to an array like [ 'utt1', 'utt2-comb2', 'utt4', ... 
] -utt_group_names = [ group[0] if len(group)==1 else group[0] + "-comb" + str(len(group)) +utt_group_names = [ group[0] if len(group)==1 else "{0}-comb{1}".format(group[0], len(group)) for group in utt_groups ] diff --git a/egs/wsj/s5/utils/data/internal/perturb_volume.py b/egs/wsj/s5/utils/data/internal/perturb_volume.py index b3bd4225191..c1dfd936358 100755 --- a/egs/wsj/s5/utils/data/internal/perturb_volume.py +++ b/egs/wsj/s5/utils/data/internal/perturb_volume.py @@ -8,6 +8,7 @@ volume of the recordings and writes to stdout the contents of a new wav.scp file. """ +from __future__ import print_function import argparse import re diff --git a/egs/wsj/s5/utils/data/perturb_speed_to_allowed_lengths.py b/egs/wsj/s5/utils/data/perturb_speed_to_allowed_lengths.py index 7924fc4fcf1..ae16e63c945 100755 --- a/egs/wsj/s5/utils/data/perturb_speed_to_allowed_lengths.py +++ b/egs/wsj/s5/utils/data/perturb_speed_to_allowed_lengths.py @@ -60,7 +60,7 @@ def get_args(): args.speed_perturb = True if args.speed_perturb == 'true' else False return args -class Utterance: +class Utterance(object): """ This class represents a Kaldi utterance in a data directory like data/train """ @@ -321,7 +321,7 @@ def main(): "Coverage rate: {}%".format(start_dur, end_dur, 100.0 - args.coverage_factor * 2)) logger.info("There will be {} unique allowed lengths " - "for the utterances.".format(int(math.log(end_dur / start_dur) / + "for the utterances.".format(int(math.log(end_dur / start_dur)/ math.log(args.factor)))) allowed_durations = find_allowed_durations(start_dur, end_dur, args) diff --git a/egs/wsj/s5/utils/filt.py b/egs/wsj/s5/utils/filt.py index 2847c0034dd..9201d9e493f 100755 --- a/egs/wsj/s5/utils/filt.py +++ b/egs/wsj/s5/utils/filt.py @@ -2,6 +2,7 @@ # Apache 2.0 +from __future__ import print_function import sys vocab=set() @@ -11,4 +12,4 @@ with open(sys.argv[2]) as textfile: for line in textfile: - print " ".join(map(lambda word: word if word in vocab else '', line.strip().split())) + print(" ".join([word if word in vocab else '' for word in line.strip().split()])) diff --git a/egs/wsj/s5/utils/lang/bpe/learn_bpe.py b/egs/wsj/s5/utils/lang/bpe/learn_bpe.py index 70f18f2d1d9..f6c6d5a0ebb 100755 --- a/egs/wsj/s5/utils/lang/bpe/learn_bpe.py +++ b/egs/wsj/s5/utils/lang/bpe/learn_bpe.py @@ -13,6 +13,8 @@ """ from __future__ import unicode_literals +from __future__ import division +from __future__ import print_function import sys import codecs diff --git a/egs/wsj/s5/utils/lang/internal/arpa2fst_constrained.py b/egs/wsj/s5/utils/lang/internal/arpa2fst_constrained.py index 19acd311c3d..31dfd08fbd2 100755 --- a/egs/wsj/s5/utils/lang/internal/arpa2fst_constrained.py +++ b/egs/wsj/s5/utils/lang/internal/arpa2fst_constrained.py @@ -4,6 +4,7 @@ # Apache 2.0. from __future__ import print_function +from __future__ import division import sys import argparse import math @@ -44,7 +45,7 @@ print(' '.join(sys.argv), file = sys.stderr) -class HistoryState: +class HistoryState(object): def __init__(self): # note: neither backoff_prob nor the floats # in word_to_prob are in log space. @@ -56,7 +57,7 @@ def __init__(self): self.word_to_prob = dict() -class ArpaModel: +class ArpaModel(object): def __init__(self): # self.orders is indexed by history-length [i.e. 
0 for unigram, # 1 for bigram and so on], and is then a dict indexed diff --git a/egs/wsj/s5/utils/lang/make_phone_lm.py b/egs/wsj/s5/utils/lang/make_phone_lm.py index 47d2a45d229..5cc9a8de832 100755 --- a/egs/wsj/s5/utils/lang/make_phone_lm.py +++ b/egs/wsj/s5/utils/lang/make_phone_lm.py @@ -4,6 +4,7 @@ # Apache 2.0. from __future__ import print_function +from __future__ import division import sys import argparse import math @@ -65,7 +66,7 @@ -class CountsForHistory: +class CountsForHistory(object): ## This class (which is more like a struct) stores the counts seen in a ## particular history-state. It is used inside class NgramCounts. ## It really does the job of a dict from int to float, but it also @@ -77,7 +78,7 @@ def __init__(self): self.total_count = 0 def Words(self): - return self.word_to_count.keys() + return list(self.word_to_count.keys()) def __str__(self): # e.g. returns ' total=12 3->4 4->6 -1->2' @@ -109,7 +110,7 @@ def AddCount(self, predicted_word, count): else: self.word_to_count[predicted_word] = new_count -class NgramCounts: +class NgramCounts(object): ## A note on data-structure. Firstly, all words are represented as ## integers. We store n-gram counts as an array, indexed by (history-length ## == n-gram order minus one) (note: python calls arrays "lists") of dicts @@ -187,7 +188,7 @@ def ApplyBackoff(self): # there will be no unigram. if args.verbose >= 1: initial_num_ngrams = self.GetNumNgrams() - for n in reversed(range(args.no_backoff_ngram_order, args.ngram_order)): + for n in reversed(list(range(args.no_backoff_ngram_order, args.ngram_order))): this_order_counts = self.counts[n] for hist, counts_for_hist in this_order_counts.items(): backoff_hist = hist[1:] @@ -276,8 +277,8 @@ def PruneEmptyStates(self): states_removed_per_hist_len = [ 0 ] * args.ngram_order - for n in reversed(range(args.no_backoff_ngram_order, - args.ngram_order)): + for n in reversed(list(range(args.no_backoff_ngram_order, + args.ngram_order))): num_states_removed = 0 for hist, counts_for_hist in self.counts[n].items(): l = len(counts_for_hist.word_to_count) @@ -304,14 +305,14 @@ def EnsureStructurallyNeededNgramsExist(self): # we have a unigram state]. if args.verbose >= 1: num_ngrams_initial = self.GetNumNgrams() - for n in reversed(range(args.no_backoff_ngram_order, - args.ngram_order)): + for n in reversed(list(range(args.no_backoff_ngram_order, + args.ngram_order))): for hist, counts_for_hist in self.counts[n].items(): # This loop ensures that if we have an n-gram like (6, 7, 8) -> 9, # then, say, (7, 8) -> 9 and (8) -> 9 exist. reduced_hist = hist - for m in reversed(range(args.no_backoff_ngram_order, n)): + for m in reversed(list(range(args.no_backoff_ngram_order, n))): reduced_hist = reduced_hist[1:] # shift an element off # the history. counts_for_backoff_hist = self.counts[m][reduced_hist] @@ -321,7 +322,7 @@ def EnsureStructurallyNeededNgramsExist(self): # then, say, (6, 7) -> 8 and (6) -> 7 exist. This will be needed # for FST representations of the ARPA LM. reduced_hist = hist - for m in reversed(range(args.no_backoff_ngram_order, n)): + for m in reversed(list(range(args.no_backoff_ngram_order, n))): this_word = reduced_hist[-1] reduced_hist = reduced_hist[:-1] # pop an element off the # history @@ -346,7 +347,7 @@ def PrintAsFst(self, word_disambig_symbol): # History will map from history (as a tuple) to integer FST-state. 
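The list(...) wrappers added throughout make_phone_lm.py here (and in generate_plots.py earlier) all address the same Python 3 change: dict.keys(), range(), zip() and map() return views or lazy iterators instead of lists, so code that indexes them, sorts them in place, or concatenates them with a list stops working. A standalone sketch of the failure modes being guarded against (values are illustrative only):

    word_to_count = {3: 4, 4: 6, -1: 2}

    keys = word_to_count.keys()          # a view on Python 3, a list on Python 2
    # keys[0]            -> TypeError on Python 3 (views are not subscriptable)
    # keys.sort()        -> AttributeError on Python 3
    # [1, 0] + range(2)  -> TypeError on Python 3 (cannot add list and range)

    first_key = list(word_to_count.keys())[0]      # portable across 2 and 3
    ordered_keys = sorted(word_to_count.keys())    # portable, no in-place sort
    orders = [1, 0] + list(range(2, 4))            # [1, 0, 2, 3]
    countdown = list(reversed(list(range(1, 4))))  # [3, 2, 1], mirrors the edits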
hist_to_state = self.GetHistToStateMap() - for n in [ 1, 0 ] + range(2, args.ngram_order): + for n in [ 1, 0 ] + list(range(2, args.ngram_order)): this_order_counts = self.counts[n] # For order 1, make sure the keys are sorted. keys = this_order_counts.keys() if n != 1 else sorted(this_order_counts.keys()) @@ -388,7 +389,7 @@ def GetProtectedNgrams(self): # add the backed-off n-grams (7, 8) -> 9 and (8) -> 9 to # 'protected-ngrams'. reduced_hist = hist - for m in reversed(range(args.no_backoff_ngram_order, n)): + for m in reversed(list(range(args.no_backoff_ngram_order, n))): reduced_hist = reduced_hist[1:] # shift an element off # the history. @@ -399,7 +400,7 @@ def GetProtectedNgrams(self): # history-state (6, 7, 8), then n-grams (6, 7, 8) and (6, 7) are # protected. This assures that the FST states are accessible. reduced_hist = hist - for m in reversed(range(args.no_backoff_ngram_order, n)): + for m in reversed(list(range(args.no_backoff_ngram_order, n))): ans.add(reduced_hist) reduced_hist = reduced_hist[:-1] # pop an element off the # history @@ -499,7 +500,7 @@ def PruningLogprobChange(self, count, discount, backoff_count, backoff_total): # and the 'count' term is zero in the numerator part of the log expression, # because symbol 'a' is completely backed off in 'this' state. this_a_change = augmented_count * \ - math.log((new_discount * new_backoff_count / new_backoff_total) / \ + math.log((new_discount * new_backoff_count / new_backoff_total)/ \ augmented_count) # other_a_change is the log-like change of symbol 'a' coming from all @@ -511,7 +512,7 @@ def PruningLogprobChange(self, count, discount, backoff_count, backoff_total): # doing so gives us an upper bound on the divergence. other_a_change = \ a_other_count * math.log((new_backoff_count / new_backoff_total) / \ - (backoff_count / backoff_total)) + (backoff_count / backoff_total)) # b_change is the log-like change of phantom symbol 'b' coming from # 'this' state (and note: it only comes from this state, that's how we diff --git a/egs/wsj/s5/utils/nnet/gen_dct_mat.py b/egs/wsj/s5/utils/nnet/gen_dct_mat.py index d0f043ad7a4..24139f1c9f8 100755 --- a/egs/wsj/s5/utils/nnet/gen_dct_mat.py +++ b/egs/wsj/s5/utils/nnet/gen_dct_mat.py @@ -20,12 +20,20 @@ # and takes into account that data-layout is along frequency axis, # while DCT is done along temporal axis. 
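gen_dct_mat.py, whose hunk follows, writes out a sparse matrix that applies a temporal DCT to spliced features, leaving the frequency layout untouched as the header comment says. The value it emits for basis k and time offset n is sqrt(2/T) * cos(pi/T * k * (n + 0.5)) with T the temporal context; assuming M_PI in that script is simply pi, a small standalone check of the formula (dimensions chosen arbitrarily):

    from __future__ import print_function, division
    from math import sqrt, cos, pi

    time_context = 5    # e.g. splicing 2 frames on either side of the centre frame
    dct_basis = 3       # number of DCT bases kept

    def dct_entry(k, n, T):
        # Same expression as the one printed per (k, n) pair in the hunk below.
        return sqrt(2.0 / T) * cos(pi / T * k * (n + 0.5))

    for k in range(dct_basis):
        print([round(dct_entry(k, n, time_context), 3) for n in range(time_context)])
    # k = 0 gives a constant row (about 0.632); higher k give oscillating rows.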
+from __future__ import division +from __future__ import print_function from math import * import sys from optparse import OptionParser +def print_on_same_line(text): + # print_function is imported above, so this single form works on Python 2 and 3 + print(text, end=' ') + parser = OptionParser() parser.add_option('--fea-dim', dest='dim', help='feature dimension') parser.add_option('--splice', dest='splice', help='applied splice value') @@ -49,19 +57,19 @@ #generate sparse DCT matrix -print '[' +print('[') for k in range(dct_basis): for m in range(dim): for n in range(timeContext): - if(n==0): - print m*'0 ', - else: - print (dim-1)*'0 ', - print str(sqrt(2.0/timeContext)*cos(M_PI/timeContext*k*(n+0.5))), + if(n==0): + print_on_same_line(m*'0 ') + else: + print_on_same_line((dim-1)*'0 ') + print_on_same_line(str(sqrt(2.0/timeContext)*cos(M_PI/timeContext*k*(n+0.5)))) if(n==timeContext-1): - print (dim-m-1)*'0 ', - print - print + print_on_same_line((dim-m-1)*'0 ') + print() + print() -print ']' +print(']') diff --git a/egs/wsj/s5/utils/nnet/gen_hamm_mat.py b/egs/wsj/s5/utils/nnet/gen_hamm_mat.py index a4262a8cffd..d7e9d9b7493 100755 --- a/egs/wsj/s5/utils/nnet/gen_hamm_mat.py +++ b/egs/wsj/s5/utils/nnet/gen_hamm_mat.py @@ -18,12 +18,20 @@ # ./gen_hamm_mat.py # script generates diagonal matrix with hamming window values +from __future__ import division +from __future__ import print_function from math import * import sys from optparse import OptionParser +def print_on_same_line(text): + # print_function is imported above, so this single form works on Python 2 and 3 + print(text, end=' ') + parser = OptionParser() parser.add_option('--fea-dim', dest='dim', help='feature dimension') parser.add_option('--splice', dest='splice', help='applied splice value') @@ -42,16 +50,16 @@ dim_mat=(2*splice+1)*dim timeContext=2*splice+1 -print '[' +print('[') for row in range(dim_mat): for col in range(dim_mat): if col!=row: - print '0', + print_on_same_line('0') else: i=int(row/dim) - print str(0.54 - 0.46*cos((M_2PI * i) / (timeContext-1))), - print + print_on_same_line(str(0.54 - 0.46*cos((M_2PI * i) / (timeContext-1)))) + print() -print ']' +print(']') diff --git a/egs/wsj/s5/utils/nnet/gen_splice.py b/egs/wsj/s5/utils/nnet/gen_splice.py index 0241aeed6ba..3fe76513df6 100755 --- a/egs/wsj/s5/utils/nnet/gen_splice.py +++ b/egs/wsj/s5/utils/nnet/gen_splice.py @@ -18,12 +18,19 @@ # ./gen_splice.py # generates Component +from __future__ import print_function from math import * import sys from optparse import OptionParser +def print_on_same_line(text): + # print_function is imported above, so this single form works on Python 2 and 3 + print(text, end=' ') + parser = OptionParser() parser.add_option('--fea-dim', dest='dim_in', help='feature dimension') parser.add_option('--splice', dest='splice', help='number of frames to concatenate with the central frame') @@ -40,12 +47,12 @@ dim_out=(2*splice+1)*dim_in -print '', dim_out, dim_in -print '[', +print(' {0} {1}'.format(dim_out, dim_in)) +print_on_same_line('[') -splice_vec = range(-splice*splice_step, splice*splice_step+1, splice_step) +splice_vec = list(range(-splice*splice_step, splice*splice_step+1, splice_step)) for idx in range(len(splice_vec)): - print splice_vec[idx], + print_on_same_line(splice_vec[idx]) -print ']' +print(']') diff --git a/egs/wsj/s5/utils/nnet/make_blstm_proto.py b/egs/wsj/s5/utils/nnet/make_blstm_proto.py index 6e540ec791a..4d269cfdef0 100755 --- a/egs/wsj/s5/utils/nnet/make_blstm_proto.py +++ b/egs/wsj/s5/utils/nnet/make_blstm_proto.py @@ -17,6 +17,7 @@ # Generated Nnet prototype, to be initialized by
'nnet-initialize'. +from __future__ import print_function import sys ### @@ -54,7 +55,7 @@ parser.print_help() sys.exit(1) -(feat_dim, num_leaves) = map(int,args); +(feat_dim, num_leaves) = [int(i) for i in args]; # Original prototype from Jiayu, # @@ -77,18 +78,18 @@ # The BLSTM layers, if o.num_layers == 1: # Single BLSTM, - print " %d %d %s" % (feat_dim, 2*o.proj_dim_last, o.cell_dim) + lstm_extra_opts + print(" %d %d %s" % (feat_dim, 2*o.proj_dim_last, o.cell_dim) + lstm_extra_opts) else: # >1 BLSTM, - print " %d %d %s" % (feat_dim, 2*o.proj_dim, o.cell_dim) + lstm_extra_opts + print(" %d %d %s" % (feat_dim, 2*o.proj_dim, o.cell_dim) + lstm_extra_opts) for l in range(o.num_layers - 2): - print " %d %d %s" % (2*o.proj_dim, 2*o.proj_dim, o.cell_dim) + lstm_extra_opts - print " %d %d %s" % (2*o.proj_dim, 2*o.proj_dim_last, o.cell_dim) + lstm_extra_opts + print(" %d %d %s" % (2*o.proj_dim, 2*o.proj_dim, o.cell_dim) + lstm_extra_opts) + print(" %d %d %s" % (2*o.proj_dim, 2*o.proj_dim_last, o.cell_dim) + lstm_extra_opts) # Adding for more stability, -print " %d %d" % (2*o.proj_dim_last, 2*o.proj_dim_last) +print(" %d %d" % (2*o.proj_dim_last, 2*o.proj_dim_last)) # Softmax layer, -print " %d %d 0.0 0.0" % (2*o.proj_dim_last, num_leaves) + softmax_affine_opts -print " %d %d" % (num_leaves, num_leaves) +print(" %d %d 0.0 0.0" % (2*o.proj_dim_last, num_leaves) + softmax_affine_opts) +print(" %d %d" % (num_leaves, num_leaves)) diff --git a/egs/wsj/s5/utils/nnet/make_cnn2d_proto.py b/egs/wsj/s5/utils/nnet/make_cnn2d_proto.py index 73455563b51..172660da825 100755 --- a/egs/wsj/s5/utils/nnet/make_cnn2d_proto.py +++ b/egs/wsj/s5/utils/nnet/make_cnn2d_proto.py @@ -17,6 +17,8 @@ # Generated Nnet prototype, to be initialized by 'nnet-initialize'. +from __future__ import division +from __future__ import print_function import math, random, sys, warnings from optparse import OptionParser @@ -139,8 +141,8 @@ assert( (o.cnn1_fmap_x_len - o.cnn1_filt_x_len) % o.cnn1_filt_x_step == 0 ) # subsample1 -cnn1_out_fmap_y_len=((1 + (o.cnn1_fmap_y_len - o.cnn1_filt_y_len) / o.cnn1_filt_y_step)) -cnn1_out_fmap_x_len=((1 + (o.cnn1_fmap_x_len - o.cnn1_filt_x_len) / o.cnn1_filt_x_step)) +cnn1_out_fmap_y_len=(1 + (o.cnn1_fmap_y_len - o.cnn1_filt_y_len) / o.cnn1_filt_y_step) +cnn1_out_fmap_x_len=(1 + (o.cnn1_fmap_x_len - o.cnn1_filt_x_len) / o.cnn1_filt_x_step) # fix filt_len and filt_step def fix_filt_step(inp_len, filt_len, filt_step): @@ -149,7 +151,7 @@ def fix_filt_step(inp_len, filt_len, filt_step): return filt_step else: # filt_step <= filt_len - for filt_step in xrange(filt_len, 0, -1): + for filt_step in range(filt_len, 0, -1): if ((inp_len - filt_len) % filt_step == 0): return filt_step @@ -167,29 +169,29 @@ def fix_filt_step(inp_len, filt_len, filt_step): ### # Begin the prototype -print "" +print("") # Convolutional part of network '''1st CNN layer''' cnn1_input_dim=feat_raw_dim * (o.delta_order+1) * (o.splice*2+1) -cnn1_out_fmap_x_len=((1 + (o.cnn1_fmap_x_len - o.cnn1_filt_x_len) / o.cnn1_filt_x_step)) -cnn1_out_fmap_y_len=((1 + (o.cnn1_fmap_y_len - o.cnn1_filt_y_len) / o.cnn1_filt_y_step)) +cnn1_out_fmap_x_len=(1 + (o.cnn1_fmap_x_len - o.cnn1_filt_x_len) / o.cnn1_filt_x_step) +cnn1_out_fmap_y_len=(1 + (o.cnn1_fmap_y_len - o.cnn1_filt_y_len) / o.cnn1_filt_y_step) cnn1_output_dim=o.cnn1_num_filters * cnn1_out_fmap_x_len * cnn1_out_fmap_y_len '''1st Pooling layer''' pool1_input_dim=cnn1_output_dim pool1_fmap_x_len=cnn1_out_fmap_x_len -pool1_out_fmap_x_len=((1 + (pool1_fmap_x_len - o.pool1_x_len) / 
o.pool1_x_step)) +pool1_out_fmap_x_len=(1 + (pool1_fmap_x_len - o.pool1_x_len) / o.pool1_x_step) pool1_fmap_y_len=cnn1_out_fmap_y_len -pool1_out_fmap_y_len=((1 + (pool1_fmap_y_len - o.pool1_y_len) / o.pool1_y_step)) +pool1_out_fmap_y_len=(1 + (pool1_fmap_y_len - o.pool1_y_len) / o.pool1_y_step) pool1_output_dim=o.cnn1_num_filters*pool1_out_fmap_x_len*pool1_out_fmap_y_len '''2nd CNN layer''' cnn2_input_dim=pool1_output_dim cnn2_fmap_x_len=pool1_out_fmap_x_len -cnn2_out_fmap_x_len=((1 + (cnn2_fmap_x_len - o.cnn2_filt_x_len) / o.cnn2_filt_x_step)) +cnn2_out_fmap_x_len=(1 + (cnn2_fmap_x_len - o.cnn2_filt_x_len) / o.cnn2_filt_x_step) cnn2_fmap_y_len=pool1_out_fmap_y_len -cnn2_out_fmap_y_len=((1 + (cnn2_fmap_y_len - o.cnn2_filt_y_len) / o.cnn2_filt_y_step)) +cnn2_out_fmap_y_len=(1 + (cnn2_fmap_y_len - o.cnn2_filt_y_len) / o.cnn2_filt_y_step) cnn2_output_dim=o.cnn2_num_filters * cnn2_out_fmap_x_len * cnn2_out_fmap_y_len @@ -242,14 +244,14 @@ def fix_filt_step(inp_len, filt_len, filt_step): vector += '%d:1:%d ' % (i, i + feat_raw_dim - 1) for i in range(feat_raw_dim+1, (feat_raw_dim + o.pitch_dim) * (o.delta_order+1) * (o.splice*2+1), feat_raw_dim + o.pitch_dim): vector += '%d:1:%d ' % (i, i + o.pitch_dim - 1) - print ' %d %d %s ' % \ - ((feat_raw_dim + o.pitch_dim) * (o.delta_order+1) * (o.splice*2+1), (feat_raw_dim + o.pitch_dim) * (o.delta_order+1) * (o.splice*2+1), vector) - print ' %d %d %s %s ' % \ - ((feat_raw_dim + o.pitch_dim) * (o.delta_order+1) * (o.splice*2+1), o.num_pitch_neurons + cnn2_output_dim, '%s/nnet.proto.convolution' % o.dirct, '%s/nnet.proto.pitch' % o.dirct) + print(' %d %d %s ' % \ + ((feat_raw_dim + o.pitch_dim) * (o.delta_order+1) * (o.splice*2+1), (feat_raw_dim + o.pitch_dim) * (o.delta_order+1) * (o.splice*2+1), vector)) + print(' %d %d %s %s ' % \ + ((feat_raw_dim + o.pitch_dim) * (o.delta_order+1) * (o.splice*2+1), o.num_pitch_neurons + cnn2_output_dim, '%s/nnet.proto.convolution' % o.dirct, '%s/nnet.proto.pitch' % o.dirct)) num_convolution_output = o.num_pitch_neurons + cnn2_output_dim else: # no pitch - print convolution_proto + print(convolution_proto) # We are done! sys.exit(0) diff --git a/egs/wsj/s5/utils/nnet/make_cnn_proto.py b/egs/wsj/s5/utils/nnet/make_cnn_proto.py index c6aa519ea96..4d8b9ca2946 100755 --- a/egs/wsj/s5/utils/nnet/make_cnn_proto.py +++ b/egs/wsj/s5/utils/nnet/make_cnn_proto.py @@ -17,6 +17,8 @@ # Generated Nnet prototype, to be initialized by 'nnet-initialize'. +from __future__ import division +from __future__ import print_function import math, random, sys from optparse import OptionParser @@ -88,7 +90,7 @@ ### # Begin the prototype -print "" +print("") # Convolutional part of network num_patch1 = 1 + (feat_raw_dim - o.patch_dim1) / o.patch_step1 @@ -150,13 +152,13 @@ vector += '%d:1:%d ' % (i, i + feat_raw_dim - 1) for i in range(feat_raw_dim+1, inputdim_of_cnn + 1, feat_raw_dim + o.pitch_dim): vector += '%d:1:%d ' % (i, i + o.pitch_dim - 1) - print ' %d %d %s ' % \ - (inputdim_of_cnn, inputdim_of_cnn, vector) - print ' %d %d %s %s ' % \ - (inputdim_of_cnn, o.num_pitch_neurons + outputdim_of_cnn, '%s/nnet.proto.convolution' % o.protodir, '%s/nnet.proto.pitch' % o.protodir) + print(' %d %d %s ' % \ + (inputdim_of_cnn, inputdim_of_cnn, vector)) + print(' %d %d %s %s ' % \ + (inputdim_of_cnn, o.num_pitch_neurons + outputdim_of_cnn, '%s/nnet.proto.convolution' % o.protodir, '%s/nnet.proto.pitch' % o.protodir)) else: # no pitch - print convolution_proto + print(convolution_proto) # We are done! 
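The feature-map bookkeeping in make_cnn_proto.py and make_cnn2d_proto.py above reduces to the usual valid-convolution length, out_len = 1 + (in_len - filt_len) / filt_step, which is an integer only when filt_step divides (in_len - filt_len) exactly; that is what the asserts and the fix_filt_step() helper are protecting. A standalone illustration with assumed dimensions (the numbers are not taken from any recipe):

    from __future__ import print_function, division

    def out_fmap_len(in_len, filt_len, filt_step):
        # Valid convolution: patches start at 0, filt_step, ..., in_len - filt_len.
        assert (in_len - filt_len) % filt_step == 0
        return 1 + (in_len - filt_len) // filt_step   # kept integral here

    def fix_filt_step(inp_len, filt_len, filt_step):
        # Same idea as the helper in the scripts: if the requested step does not
        # tile the input exactly, fall back to the largest step not exceeding the
        # filter length that does.
        if (inp_len - filt_len) % filt_step == 0:
            return filt_step
        for step in range(filt_len, 0, -1):
            if (inp_len - filt_len) % step == 0:
                return step

    print(out_fmap_len(40, 8, 4))    # 9 patches along a 40-dim axis
    print(fix_filt_step(40, 7, 4))   # 33 is not divisible by 4, falls back to 3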
sys.exit(0) diff --git a/egs/wsj/s5/utils/nnet/make_lstm_proto.py b/egs/wsj/s5/utils/nnet/make_lstm_proto.py index a2da0a194fc..6818c860ed0 100755 --- a/egs/wsj/s5/utils/nnet/make_lstm_proto.py +++ b/egs/wsj/s5/utils/nnet/make_lstm_proto.py @@ -17,6 +17,7 @@ # Generated Nnet prototype, to be initialized by 'nnet-initialize'. +from __future__ import print_function import sys ### @@ -52,7 +53,7 @@ parser.print_help() sys.exit(1) -(feat_dim, num_leaves) = map(int,args); +(feat_dim, num_leaves) = [int(i) for i in args]; # Original prototype from Jiayu, # @@ -73,14 +74,14 @@ if None != o.param_stddev: softmax_affine_opts += " %f " % o.param_stddev # The LSTM layers, -print " %d %d %s" % (feat_dim, o.proj_dim, o.cell_dim) + lstm_extra_opts +print(" %d %d %s" % (feat_dim, o.proj_dim, o.cell_dim) + lstm_extra_opts) for l in range(o.num_layers - 1): - print " %d %d %s" % (o.proj_dim, o.proj_dim, o.cell_dim) + lstm_extra_opts + print(" %d %d %s" % (o.proj_dim, o.proj_dim, o.cell_dim) + lstm_extra_opts) # Adding for more stability, -print " %d %d" % (o.proj_dim, o.proj_dim) +print(" %d %d" % (o.proj_dim, o.proj_dim)) # Softmax layer, -print " %d %d 0.0 0.0" % (o.proj_dim, num_leaves) + softmax_affine_opts -print " %d %d" % (num_leaves, num_leaves) +print(" %d %d 0.0 0.0" % (o.proj_dim, num_leaves) + softmax_affine_opts) +print(" %d %d" % (num_leaves, num_leaves)) diff --git a/egs/wsj/s5/utils/nnet/make_nnet_proto.py b/egs/wsj/s5/utils/nnet/make_nnet_proto.py index 99198cbe44b..4f60be6c9d0 100755 --- a/egs/wsj/s5/utils/nnet/make_nnet_proto.py +++ b/egs/wsj/s5/utils/nnet/make_nnet_proto.py @@ -17,6 +17,8 @@ # Generated Nnet prototype, to be initialized by 'nnet-initialize'. +from __future__ import division +from __future__ import print_function import math, random, sys, re ### @@ -87,7 +89,7 @@ o.affine_opts = o.affine_opts.replace("_"," ") o.dropout_opts = o.dropout_opts.replace("_"," ") -(feat_dim, num_leaves, num_hid_layers, num_hid_neurons) = map(int,args); +(feat_dim, num_leaves, num_hid_layers, num_hid_neurons) = [int(i) for i in args]; ### End parse options @@ -120,46 +122,46 @@ def Glorot(dim1, dim2): assert(num_hid_layers == 0) if o.bottleneck_trick: # 25% smaller stddev -> small bottleneck range, 10x smaller learning rate - print " %d %d %f %f" % \ + print(" %d %d %f %f" % \ (feat_dim, o.bottleneck_dim, \ - (o.param_stddev_factor * Glorot(feat_dim, o.bottleneck_dim) * 0.75 ), 0.1) + (o.param_stddev_factor * Glorot(feat_dim, o.bottleneck_dim) * 0.75 ), 0.1)) # 25% smaller stddev -> smaller gradient in prev. 
layer, 10x smaller learning rate for weigts & biases - print " %d %d %f %f %f %f %f %f" % \ + print(" %d %d %f %f %f %f %f %f" % \ (o.bottleneck_dim, num_hid_neurons, o.hid_bias_mean, o.hid_bias_range, \ - (o.param_stddev_factor * Glorot(o.bottleneck_dim, num_hid_neurons) * 0.75 ), 0.1, 0.1, o.max_norm) + (o.param_stddev_factor * Glorot(o.bottleneck_dim, num_hid_neurons) * 0.75 ), 0.1, 0.1, o.max_norm)) else: - print " %d %d %f" % \ + print(" %d %d %f" % \ (feat_dim, o.bottleneck_dim, \ - (o.param_stddev_factor * Glorot(feat_dim, o.bottleneck_dim))) - print " %d %d %f %f %f %f" % \ + (o.param_stddev_factor * Glorot(feat_dim, o.bottleneck_dim)))) + print(" %d %d %f %f %f %f" % \ (o.bottleneck_dim, num_hid_neurons, o.hid_bias_mean, o.hid_bias_range, \ - (o.param_stddev_factor * Glorot(o.bottleneck_dim, num_hid_neurons)), o.max_norm) - print "%s %d %d %s" % (o.activation_type, num_hid_neurons, num_hid_neurons, o.activation_opts) # Non-linearity + (o.param_stddev_factor * Glorot(o.bottleneck_dim, num_hid_neurons)), o.max_norm)) + print("%s %d %d %s" % (o.activation_type, num_hid_neurons, num_hid_neurons, o.activation_opts)) # Non-linearity # Last AffineTransform (10x smaller learning rate on bias) - print " %d %d %f %f %f %f %f" % \ + print(" %d %d %f %f %f %f %f" % \ (num_hid_neurons, num_leaves, 0.0, 0.0, \ - (o.param_stddev_factor * Glorot(num_hid_neurons, num_leaves)), 1.0, 0.1) + (o.param_stddev_factor * Glorot(num_hid_neurons, num_leaves)), 1.0, 0.1)) # Optionaly append softmax if o.with_softmax: if o.block_softmax_dims == "": - print " %d %d" % (num_leaves, num_leaves) + print(" %d %d" % (num_leaves, num_leaves)) else: - print " %d %d %s" % (num_leaves, num_leaves, o.block_softmax_dims) - print "" + print(" %d %d %s" % (num_leaves, num_leaves, o.block_softmax_dims)) + print("") # We are done! sys.exit(0) # NO HIDDEN LAYERS! # Add only last layer (logistic regression) if num_hid_layers == 0: - print " %d %d %f %f %f" % \ - (feat_dim, num_leaves, 0.0, 0.0, (o.param_stddev_factor * Glorot(feat_dim, num_leaves))) + print(" %d %d %f %f %f" % \ + (feat_dim, num_leaves, 0.0, 0.0, (o.param_stddev_factor * Glorot(feat_dim, num_leaves)))) if o.with_softmax: if o.block_softmax_dims == "": - print " %d %d" % (num_leaves, num_leaves) + print(" %d %d" % (num_leaves, num_leaves)) else: - print " %d %d %s" % (num_leaves, num_leaves, o.block_softmax_dims) - print "" + print(" %d %d %s" % (num_leaves, num_leaves, o.block_softmax_dims)) + print("") # We are done! sys.exit(0) @@ -170,63 +172,63 @@ def Glorot(dim1, dim2): # Begin the prototype, # First AffineTranform, -print " %d %d %f %f %f %f %s" % \ +print(" %d %d %f %f %f %f %s" % \ (feat_dim, num_hid_neurons, o.hid_bias_mean, o.hid_bias_range, \ (o.param_stddev_factor * Glorot(feat_dim, num_hid_neurons) * \ - (math.sqrt(1.0/12.0) if o.smaller_input_weights else 1.0)), o.max_norm, o.affine_opts) + (math.sqrt(1.0/12.0) if o.smaller_input_weights else 1.0)), o.max_norm, o.affine_opts)) # Note.: compensating dynamic range mismatch between input features and Sigmoid-hidden layers, # i.e. mapping the std-dev of N(0,1) (input features) to std-dev of U[0,1] (sigmoid-outputs). # This is done by multiplying with stddev(U[0,1]) = sqrt(1/12). 
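The sqrt(1/12) in this note is simply the standard deviation of a U[0,1] variable (uniform variance is (b - a)^2 / 12), so multiplying the first layer's weight stddev by it maps roughly unit-variance input features onto the dynamic range of sigmoid outputs; the 0.29 mentioned in the next comment line is the same constant rounded. A quick numeric check, with a placeholder standing in for the script's Glorot() term:

    import math

    stddev_uniform_0_1 = math.sqrt(1.0 / 12.0)
    print(round(stddev_uniform_0_1, 4))    # 0.2887

    # Shape of the first-AffineTransform stddev above: the extra factor is only
    # applied when --smaller-input-weights is set.
    param_stddev_factor = 1.0     # illustrative value
    glorot_term = 0.1             # placeholder, not Kaldi's actual Glorot() output
    smaller_input_weights = True
    first_layer_stddev = (param_stddev_factor * glorot_term *
                          (stddev_uniform_0_1 if smaller_input_weights else 1.0))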
# The stddev of weights is consequently reduced with scale 0.29, -print "%s %d %d %s" % (o.activation_type, num_hid_neurons, num_hid_neurons, o.activation_opts) +print("%s %d %d %s" % (o.activation_type, num_hid_neurons, num_hid_neurons, o.activation_opts)) if o.with_dropout: - print " %d %d %s" % (num_hid_neurons, num_hid_neurons, o.dropout_opts) + print(" %d %d %s" % (num_hid_neurons, num_hid_neurons, o.dropout_opts)) # Internal AffineTransforms, for i in range(num_hid_layers-1): - print " %d %d %f %f %f %f %s" % \ + print(" %d %d %f %f %f %f %s" % \ (num_hid_neurons, num_hid_neurons, o.hid_bias_mean, o.hid_bias_range, \ - (o.param_stddev_factor * Glorot(num_hid_neurons, num_hid_neurons)), o.max_norm, o.affine_opts) - print "%s %d %d %s" % (o.activation_type, num_hid_neurons, num_hid_neurons, o.activation_opts) + (o.param_stddev_factor * Glorot(num_hid_neurons, num_hid_neurons)), o.max_norm, o.affine_opts)) + print("%s %d %d %s" % (o.activation_type, num_hid_neurons, num_hid_neurons, o.activation_opts)) if o.with_dropout: - print " %d %d %s" % (num_hid_neurons, num_hid_neurons, o.dropout_opts) + print(" %d %d %s" % (num_hid_neurons, num_hid_neurons, o.dropout_opts)) # Optionaly add bottleneck, if o.bottleneck_dim != 0: assert(o.bottleneck_dim > 0) if o.bottleneck_trick: # 25% smaller stddev -> small bottleneck range, 10x smaller learning rate - print " %d %d %f %f" % \ + print(" %d %d %f %f" % \ (num_hid_neurons, o.bottleneck_dim, \ - (o.param_stddev_factor * Glorot(num_hid_neurons, o.bottleneck_dim) * 0.75 ), 0.1) + (o.param_stddev_factor * Glorot(num_hid_neurons, o.bottleneck_dim) * 0.75 ), 0.1)) # 25% smaller stddev -> smaller gradient in prev. layer, 10x smaller learning rate for weigts & biases - print " %d %d %f %f %f %f %f %f %s" % \ + print(" %d %d %f %f %f %f %f %f %s" % \ (o.bottleneck_dim, num_hid_neurons, o.hid_bias_mean, o.hid_bias_range, \ - (o.param_stddev_factor * Glorot(o.bottleneck_dim, num_hid_neurons) * 0.75 ), 0.1, 0.1, o.max_norm, o.affine_opts) + (o.param_stddev_factor * Glorot(o.bottleneck_dim, num_hid_neurons) * 0.75 ), 0.1, 0.1, o.max_norm, o.affine_opts)) else: # Same learninig-rate and stddev-formula everywhere, - print " %d %d %f" % \ + print(" %d %d %f" % \ (num_hid_neurons, o.bottleneck_dim, \ - (o.param_stddev_factor * Glorot(num_hid_neurons, o.bottleneck_dim))) - print " %d %d %f %f %f %f %s" % \ + (o.param_stddev_factor * Glorot(num_hid_neurons, o.bottleneck_dim)))) + print(" %d %d %f %f %f %f %s" % \ (o.bottleneck_dim, num_hid_neurons, o.hid_bias_mean, o.hid_bias_range, \ - (o.param_stddev_factor * Glorot(o.bottleneck_dim, num_hid_neurons)), o.max_norm, o.affine_opts) - print "%s %d %d %s" % (o.activation_type, num_hid_neurons, num_hid_neurons, o.activation_opts) + (o.param_stddev_factor * Glorot(o.bottleneck_dim, num_hid_neurons)), o.max_norm, o.affine_opts)) + print("%s %d %d %s" % (o.activation_type, num_hid_neurons, num_hid_neurons, o.activation_opts)) if o.with_dropout: - print " %d %d %s" % (num_hid_neurons, num_hid_neurons, o.dropout_opts) + print(" %d %d %s" % (num_hid_neurons, num_hid_neurons, o.dropout_opts)) # Last AffineTransform (10x smaller learning rate on bias) -print " %d %d %f %f %f %f %f" % \ +print(" %d %d %f %f %f %f %f" % \ (num_hid_neurons, num_leaves, 0.0, 0.0, \ - (o.param_stddev_factor * Glorot(num_hid_neurons, num_leaves)), 1.0, 0.1) + (o.param_stddev_factor * Glorot(num_hid_neurons, num_leaves)), 1.0, 0.1)) # Optionaly append softmax if o.with_softmax: if o.block_softmax_dims == "": - print " %d %d" % (num_leaves, num_leaves) 
+ print(" %d %d" % (num_leaves, num_leaves)) else: - print " %d %d %s" % (num_leaves, num_leaves, o.block_softmax_dims) + print(" %d %d %s" % (num_leaves, num_leaves, o.block_softmax_dims)) # We are done! sys.exit(0) diff --git a/egs/wsj/s5/utils/prepare_lang.sh b/egs/wsj/s5/utils/prepare_lang.sh index 6439a136288..7c018fd94f9 100755 --- a/egs/wsj/s5/utils/prepare_lang.sh +++ b/egs/wsj/s5/utils/prepare_lang.sh @@ -108,6 +108,11 @@ srcdir=$1 oov_word=$2 tmpdir=$3 dir=$4 + + +if [ -d $dir/phones ]; then + rm -r $dir/phones +fi mkdir -p $dir $tmpdir $dir/phones silprob=false @@ -213,7 +218,6 @@ else paste -d' ' $tmpdir/phones $tmpdir/phones > $tmpdir/phone_map.txt fi -mkdir -p $dir/phones # various sets of phones... # Sets of phones for use in clustering, and making monophone systems. diff --git a/egs/wsj/s5/utils/reverse_arpa.py b/egs/wsj/s5/utils/reverse_arpa.py index 5437aec4341..e154a6e0813 100755 --- a/egs/wsj/s5/utils/reverse_arpa.py +++ b/egs/wsj/s5/utils/reverse_arpa.py @@ -2,11 +2,12 @@ # -*- coding: utf-8 -*- # Copyright 2012 Mirko Hannemann BUT, mirko.hannemann@gmail.com +from __future__ import print_function import sys import codecs # for UTF-8/unicode if len(sys.argv) != 2: - print 'usage: reverse_arpa arpa.in' + print('usage: reverse_arpa arpa.in') sys.exit() arpaname = sys.argv[1] @@ -34,13 +35,13 @@ try: file = codecs.open(arpaname, "r", "utf-8") except IOError: - print 'file not found: ' + arpaname + print('file not found: ' + arpaname) sys.exit() text=file.readline() while (text and text[:6] != "\\data\\"): text=file.readline() if not text: - print "invalid ARPA file" + print("invalid ARPA file") sys.exit() #print text, while (text and text[:5] != "ngram"): text=file.readline() @@ -54,7 +55,7 @@ r = ind[0].split() read_n = int(r[1].strip()) if read_n != n+1: - print "invalid ARPA file:", text + print("invalid ARPA file: {}".format(text)) sys.exit() n = read_n cngrams.append(counts) @@ -68,7 +69,7 @@ for n in range(1,len(cngrams)+1): # unigrams, bigrams, trigrams while (text and "-grams:" not in text): text=file.readline() if n != int(text[1]): - print "invalid ARPA file:", text + print("invalid ARPA file:{}".format(text)) sys.exit() #print text,cngrams[n-1] this_ngrams={} # stores all read ngrams @@ -115,7 +116,7 @@ while (text and text[:5] != "\\end\\"): text=file.readline() if not text: - print "invalid ARPA file" + print("invalid ARPA file") sys.exit() file.close() #print text, @@ -133,14 +134,13 @@ #p(ABCD)+b(ABCD)-p(BCD)+p(ABC)-p(BC)+p(AB)-p(B)+p(A) DCBA 0 # compute new reversed ARPA model -print "\\data\\" +print("\\data\\") for n in range(1,len(cngrams)+1): # unigrams, bigrams, trigrams - print "ngram "+str(n)+"="+str(len(ngrams[n-1].keys())) + print("ngram {0} = {1}".format(n, len(ngrams[n-1].keys()))) offset = 0.0 for n in range(1,len(cngrams)+1): # unigrams, bigrams, trigrams - print "\\"+str(n)+"-grams:" - keys = ngrams[n-1].keys() - keys.sort() + print("\\{}-grams:".format(n)) + keys = sorted(ngrams[n-1].keys()) for ngram in keys: prob = ngrams[n-1][ngram] # reverse word order @@ -179,10 +179,10 @@ elif n == 2: revprob = revprob + offset # add weight to bigrams starting with if (prob[1] != inf): # only backoff weights from not newly created ngrams - print revprob,rev_ngram.encode("utf-8"),back + print(revprob,rev_ngram.encode("utf-8"),back) else: - print revprob,rev_ngram.encode("utf-8"),"-100000.0" + print(revprob,rev_ngram.encode("utf-8"),"-100000.0") else: # highest order - no backoff weights if (n==2) and (rev_ngram[:3] == ""): revprob = revprob + offset - 
print revprob,rev_ngram.encode("utf-8") -print "\\end\\" + print(revprob,rev_ngram.encode("utf-8")) +print("\\end\\") diff --git a/egs/yomdle_fa/v1/local/create_line_image_from_page_image.py b/egs/yomdle_fa/v1/local/create_line_image_from_page_image.py index 77a6791d5d7..7135bb1b242 100755 --- a/egs/yomdle_fa/v1/local/create_line_image_from_page_image.py +++ b/egs/yomdle_fa/v1/local/create_line_image_from_page_image.py @@ -110,7 +110,7 @@ def bounding_area(index, hull): return {'area': len_p * len_o, 'length_parallel': len_p, 'length_orthogonal': len_o, - 'rectangle_center': (min_p + len_p / 2, min_o + len_o / 2), + 'rectangle_center': (min_p + float(len_p) / 2, min_o + float(len_o) / 2), 'unit_vector': unit_vector_p, } @@ -275,8 +275,8 @@ def get_center(im): ------- (int, int): center of the image """ - center_x = im.size[0] / 2 - center_y = im.size[1] / 2 + center_x = float(im.size[0]) / 2 + center_y = float(im.size[1]) / 2 return int(center_x), int(center_y) diff --git a/egs/yomdle_fa/v1/local/gedi2csv.py b/egs/yomdle_fa/v1/local/gedi2csv.py index 43a07421dd1..0b80c2e80bb 100755 --- a/egs/yomdle_fa/v1/local/gedi2csv.py +++ b/egs/yomdle_fa/v1/local/gedi2csv.py @@ -55,7 +55,7 @@ def npbox2string(npar): # cv2.minAreaRect() returns a Box2D structure which contains following detals - ( center (x,y), (width, height), angle of rotation ) # Get 4 corners of the rectangle using cv2.boxPoints() -class GEDI2CSV(): +class GEDI2CSV(object): """ Initialize the extractor""" def __init__(self, logger, args): diff --git a/egs/yomdle_fa/v1/local/yomdle2csv.py b/egs/yomdle_fa/v1/local/yomdle2csv.py index 3641de90324..8f208e2d968 100755 --- a/egs/yomdle_fa/v1/local/yomdle2csv.py +++ b/egs/yomdle_fa/v1/local/yomdle2csv.py @@ -55,7 +55,7 @@ def npbox2string(npar): # cv2.minAreaRect() returns a Box2D structure which contains following detals - ( center (x,y), (width, height), angle of rotation ) # Get 4 corners of the rectangle using cv2.boxPoints() -class GEDI2CSV(): +class GEDI2CSV(object): """ Initialize the extractor""" def __init__(self, logger, args): diff --git a/egs/yomdle_korean/README.txt b/egs/yomdle_korean/README.txt new file mode 100644 index 00000000000..3bf4cc8cd2d --- /dev/null +++ b/egs/yomdle_korean/README.txt @@ -0,0 +1,3 @@ +This directory contains example scripts for OCR on the Yomdle and Slam datasets. +Training is done on the Yomdle dataset and testing is done on Slam. +LM rescoring is also done with extra corpus data obtained from various sources diff --git a/egs/yomdle_korean/v1/cmd.sh b/egs/yomdle_korean/v1/cmd.sh new file mode 100755 index 00000000000..3d69546dfe8 --- /dev/null +++ b/egs/yomdle_korean/v1/cmd.sh @@ -0,0 +1,12 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 
+export cmd="queue.pl" diff --git a/egs/yomdle_korean/v1/image b/egs/yomdle_korean/v1/image new file mode 120000 index 00000000000..1668ee99922 --- /dev/null +++ b/egs/yomdle_korean/v1/image @@ -0,0 +1 @@ +../../cifar/v1/image/ \ No newline at end of file diff --git a/egs/yomdle_korean/v1/local/augment_data.sh b/egs/yomdle_korean/v1/local/augment_data.sh new file mode 100755 index 00000000000..136bfd24eb2 --- /dev/null +++ b/egs/yomdle_korean/v1/local/augment_data.sh @@ -0,0 +1,36 @@ +#!/bin/bash +# Copyright 2018 Hossein Hadian +# 2018 Ashish Arora + +# Apache 2.0 +# This script performs data augmentation. + +nj=4 +cmd=run.pl +feat_dim=40 +verticle_shift=0 +echo "$0 $@" + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh || exit 1; + +srcdir=$1 +outdir=$2 +datadir=$3 + +mkdir -p $datadir/augmentations +echo "copying $srcdir to $datadir/augmentations/aug1, allowed length, creating feats.scp" + +for set in aug1; do + image/copy_data_dir.sh --spk-prefix $set- --utt-prefix $set- \ + $srcdir $datadir/augmentations/$set + cat $srcdir/allowed_lengths.txt > $datadir/augmentations/$set/allowed_lengths.txt + local/extract_features.sh --nj $nj --cmd "$cmd" --feat-dim $feat_dim \ + --vertical-shift $verticle_shift \ + --fliplr false --augment 'random_scale' $datadir/augmentations/$set +done + +echo " combine original data and data from different augmentations" +utils/combine_data.sh --extra-files images.scp $outdir $srcdir $datadir/augmentations/aug1 +cat $srcdir/allowed_lengths.txt > $outdir/allowed_lengths.txt diff --git a/egs/yomdle_korean/v1/local/chain/compare_wer.sh b/egs/yomdle_korean/v1/local/chain/compare_wer.sh new file mode 100755 index 00000000000..80f31e0f311 --- /dev/null +++ b/egs/yomdle_korean/v1/local/chain/compare_wer.sh @@ -0,0 +1,66 @@ +#!/bin/bash + +# this script is used for comparing decoding results between systems. +# e.g. local/chain/compare_wer.sh exp/chain/cnn{1a,1b} + +# Copyright 2017 Chun Chieh Chang +# 2017 Ashish Arora + +if [ $# == 0 ]; then + echo "Usage: $0: [ ... ]" + echo "e.g.: $0 exp/chain/cnn{1a,1b}" + exit 1 +fi + +echo "# $0 $*" +used_epochs=false + +echo -n "# System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +echo -n "# WER " +for x in $*; do + wer=$(cat $x/decode_test/scoring_kaldi/best_wer | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "# WER (rescored) " +for x in $*; do + wer=$(cat $x/decode_test_rescored/scoring_kaldi/best_wer | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "# CER " +for x in $*; do + cer=$(cat $x/decode_test/scoring_kaldi/best_cer | awk '{print $2}') + printf "% 10s" $cer +done +echo + +echo -n "# CER (rescored) " +for x in $*; do + cer=$(cat $x/decode_test_rescored/scoring_kaldi/best_cer | awk '{print $2}') + printf "% 10s" $cer +done +echo + +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. 
+fi + +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo diff --git a/egs/yomdle_korean/v1/local/chain/run_cnn_e2eali.sh b/egs/yomdle_korean/v1/local/chain/run_cnn_e2eali.sh new file mode 120000 index 00000000000..fcf59f917c1 --- /dev/null +++ b/egs/yomdle_korean/v1/local/chain/run_cnn_e2eali.sh @@ -0,0 +1 @@ +tuning/run_cnn_e2eali_1b.sh \ No newline at end of file diff --git a/egs/yomdle_korean/v1/local/chain/run_e2e_cnn.sh b/egs/yomdle_korean/v1/local/chain/run_e2e_cnn.sh new file mode 100755 index 00000000000..cea60a221a1 --- /dev/null +++ b/egs/yomdle_korean/v1/local/chain/run_e2e_cnn.sh @@ -0,0 +1,132 @@ +#!/bin/bash + +# Copyright 2017 Hossein Hadian + +# This script does end2end chain training (i.e. from scratch) +# local/chain/compare_wer.sh exp/chain/e2e_cnn_1a/ +# System e2e_cnn_1a +# score_basic score_nomalized +# WER 13.64 10.6 +# WER (rescored) 13.13 10.2 +# CER 2.99 3.0 +# CER (rescored) 2.88 2.9 +# Final train prob 0.0113 +# Final valid prob 0.0152 +# steps/info/chain_dir_info.pl exp/chain/e2e_cnn_1a +# exp/chain/e2e_cnn_1a: num-iters=48 nj=5..8 num-params=3.0M dim=40->352 combine=0.047->0.047 (over 2) logprob:train/valid[31,47,final]=(0.002,0.008,0.011/0.008,0.013,0.015) + +set -e +# configs for 'chain' +stage=0 +nj=30 +train_stage=-10 +get_egs_stage=-10 +affix=1a + +# training options +tdnn_dim=450 +minibatch_size=150=64,32/300=32,16/600=16,8/1200=8,4 +cmvn_opts="--norm-means=false --norm-vars=false" +train_set=train +lang_decode=data/lang +decode_e2e=true +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 1 ]; then + steps/nnet3/chain/e2e/prepare_e2e.sh --nj $nj --cmd "$cmd" \ + --shared-phones true \ + --type mono \ + data/$train_set $lang $treedir + $cmd $treedir/log/make_phone_lm.log \ + cat data/$train_set/text \| \ + steps/nnet3/chain/e2e/text_to_phones.py data/lang \| \ + utils/sym2int.pl -f 2- data/lang/phones.txt \| \ + chain-est-phone-lm --num-extra-lm-states=500 \ + ark:- $treedir/phone_lm.fst +fi + +if [ $stage -le 2 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}') + cnn_opts="l2-regularize=0.075" + tdnn_opts="l2-regularize=0.075" + output_opts="l2-regularize=0.1" + common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn6 height-in=10 height-out=10 time-offsets=-4,0,4 $common3 + conv-relu-batchnorm-layer name=cnn7 height-in=10 height-out=10 time-offsets=-4,0,4 $common3 + relu-batchnorm-layer name=tdnn1 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $output_opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs +fi + +if [ $stage -le 3 ]; then + steps/nnet3/chain/e2e/train_e2e.py --stage $train_stage \ + --cmd "$cmd" \ + --feat.cmvn-opts "$cmvn_opts" \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.apply-deriv-weights true \ + --egs.stage $get_egs_stage \ + --egs.opts "--num_egs_diagnostic 100 --num_utts_subset 400" \ + --chain.frame-subsampling-factor 4 \ + --chain.alignment-subsampling-factor 4 \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 3 \ + --trainer.optimization.momentum 0 \ + --trainer.optimization.num-jobs-initial 5 \ + --trainer.optimization.num-jobs-final 8 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.shrink-value 1.0 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs true \ + --feat-dir data/${train_set} \ + --tree-dir $treedir \ + --dir $dir || exit 1; +fi diff --git a/egs/yomdle_korean/v1/local/chain/tuning/run_cnn_e2eali_1a.sh 
b/egs/yomdle_korean/v1/local/chain/tuning/run_cnn_e2eali_1a.sh new file mode 100755 index 00000000000..c43d7c669c1 --- /dev/null +++ b/egs/yomdle_korean/v1/local/chain/tuning/run_cnn_e2eali_1a.sh @@ -0,0 +1,236 @@ +#!/bin/bash + +# e2eali_1a is the same as 1a but uses the e2e chain model to get the +# lattice alignments and to build a tree + +# local/chain/compare_wer.sh exp/old/chain/cnn_e2eali_1a/ +# System cnn_e2eali_1a +# WER 15.68 +# CER 3.18 +# Final train prob -0.0331 +# Final valid prob -0.0395 + +# steps/info/chain_dir_info.pl exp/chain/cnn_e2eali_1a/ +# exp/old/chain/cnn_e2eali_1a/: num-iters=33 nj=3..16 num-params=5.2M dim=40->456 combine=-0.035->-0.035 (over 1) xent:train/valid[21,32,final]=(-0.226,-0.175,-0.169/-0.248,-0.202,-0.195) logprob:train/valid[21,32,final]=(-0.039,-0.034,-0.033/-0.046,-0.040,-0.039) + +# Normalize scoring +# WER = 11.7 +# CER = 3.3 + +set -e -o pipefail +stage=0 +nj=30 +train_set=train +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. +affix=_1a #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +common_egs_dir= +reporting_email= + +# chain options +train_stage=-10 +xent_regularize=0.1 +frame_subsampling_factor=4 +# training chunk-options +chunk_width=340,300,200,100 +num_leaves=500 +# we don't need extra left/right context for TDNN systems. +tdnn_dim=450 +# training options +srand=0 +remove_egs=false +lang_decode=data/lang +lang_rescore=data/lang_rescore_6g +decode_chain=false +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ + --acoustic-scale 1.0 \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + ${train_data_dir} data/lang $e2echain_model_dir $lat_dir + echo "" >$lat_dir/splice_opts +fi + +if [ $stage -le 3 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
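+    # (The chain egs and the denominator FST are built from this tree, so a
+    # stale tree would silently mismatch them; remove $tree_dir by hand if
+    # you really want to rebuild it, e.g. after changing num_leaves.)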
+ exit 1; + fi + + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$cmd" $num_leaves ${train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 4 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + cnn_opts="l2-regularize=0.075" + tdnn_opts="l2-regularize=0.075" + output_opts="l2-regularize=0.1" + common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=90" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn6 height-in=10 height-out=10 time-offsets=-4,0,4 $common3 + conv-relu-batchnorm-layer name=cnn7 height-in=10 height-out=10 time-offsets=-4,0,4 $common3 + relu-batchnorm-layer name=tdnn1 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' mod?els... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 5 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.frame-subsampling-factor=$frame_subsampling_factor \ + --chain.alignment-subsampling-factor=1 \ + --chain.left-tolerance 3 \ + --chain.right-tolerance 3 \ + --chain.lm-opts="--ngram-order=2 --no-prune-ngram-order=1 --num-extra-lm-states=900" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=4 \ + --trainer.frames-per-iter=1000000 \ + --trainer.optimization.num-jobs-initial=3 \ + --trainer.optimization.num-jobs-final=16 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=32,16 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0 --constrained false" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 6 ] && $decode_chain; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/mkgraph.sh \ + --self-loop-scale 1.0 $lang_decode \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 7 ] && $decode_chain; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --beam 12 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/test $dir/decode_test || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/test $dir/decode_test{,_rescored} || exit 1 + + echo "Done. Date: $(date). Results:" + local/chain/compare_wer.sh $dir +fi diff --git a/egs/yomdle_korean/v1/local/chain/tuning/run_cnn_e2eali_1b.sh b/egs/yomdle_korean/v1/local/chain/tuning/run_cnn_e2eali_1b.sh new file mode 100755 index 00000000000..8fca9235f46 --- /dev/null +++ b/egs/yomdle_korean/v1/local/chain/tuning/run_cnn_e2eali_1b.sh @@ -0,0 +1,208 @@ +#!/bin/bash + +# e2eali_1b is the same as e2eali_1a but has fewer CNN layers, smaller +# l2-regularize, more epochs and uses dropout. 
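+
+# The dropout schedule '0,0@0.20,0.2@0.50,0' used below is interpreted by the
+# nnet3 training code as a piecewise-linear function of the fraction of data
+# seen so far: dropout stays at 0 for the first 20% of training, rises to 0.2
+# at 50%, and decays back to 0 by the end.
+# Illustrative invocation (only --stage/--train_stage shown):
+#   local/chain/tuning/run_cnn_e2eali_1b.sh --stage 4 --train_stage -10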
+ +#local/chain/compare_wer.sh exp/chain/cnn_e2eali_1b/ +# System cnn_e2eali_1b +# score_basic score_nomalized +# WER 13.01 10.0 +# WER (rescored) 12.69 9.6 +# CER 2.78 3.0 +# CER (rescored) 2.70 2.8 +# Final train prob -0.0568 +# Final valid prob -0.0410 +#steps/info/chain_dir_info.pl exp/chain/cnn_e2eali_1b +#exp/chain/cnn_e2eali_1b: num-iters=67 nj=3..16 num-params=5.2M dim=40->464 combine=-0.052->-0.052 (over 1) xent:train/valid[43,66,final]=(-0.379,-0.319,-0.304/-0.291,-0.234,-0.227) logprob:train/valid[43,66,final]=(-0.069,-0.058,-0.057/-0.046,-0.041,-0.041) +set -e -o pipefail +stage=0 +nj=30 +train_set=train +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. +affix=_1a #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +common_egs_dir= +reporting_email= + +# chain options +train_stage=-10 +xent_regularize=0.1 +frame_subsampling_factor=4 +# training chunk-options +chunk_width=340,300,200,100 +num_leaves=1000 +# we don't need extra left/right context for TDNN systems. +tdnn_dim=550 +# training options +srand=0 +remove_egs=false +lang_decode=data/lang +decode_chain=true +dropout_schedule='0,0@0.20,0.2@0.50,0' +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ + --acoustic-scale 1.0 \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + ${train_data_dir} data/lang $e2echain_model_dir $lat_dir + echo "" >$lat_dir/splice_opts +fi + +if [ $stage -le 3 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
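+    # (Remove $tree_dir manually if you really intend to rebuild the tree,
+    # e.g. after changing num_leaves; the egs and den.fst below depend on it.)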
+ exit 1; + fi + + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$cmd" $num_leaves ${train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 4 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + cnn_opts="l2-regularize=0.03 dropout-proportion=0.0" + tdnn_opts="l2-regularize=0.03" + output_opts="l2-regularize=0.04" + common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=90" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + conv-relu-batchnorm-dropout-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-dropout-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-dropout-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-dropout-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-dropout-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + conv-relu-batchnorm-dropout-layer name=cnn6 height-in=10 height-out=10 time-offsets=-4,0,4 $common3 + conv-relu-batchnorm-dropout-layer name=cnn7 height-in=10 height-out=10 time-offsets=-4,0,4 $common3 + relu-batchnorm-dropout-layer name=tdnn1 input=Append(-8,-4,0,4,8) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + relu-batchnorm-dropout-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + relu-batchnorm-dropout-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' mod?els... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 5 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.frame-subsampling-factor=$frame_subsampling_factor \ + --chain.alignment-subsampling-factor=1 \ + --chain.left-tolerance 3 \ + --chain.right-tolerance 3 \ + --chain.lm-opts="--ngram-order=2 --no-prune-ngram-order=1 --num-extra-lm-states=900" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=16 \ + --trainer.frames-per-iter=2000000 \ + --trainer.optimization.num-jobs-initial=3 \ + --trainer.optimization.num-jobs-final=16 \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=32,16 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0 --constrained false" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi diff --git a/egs/yomdle_korean/v1/local/check_tools.sh b/egs/yomdle_korean/v1/local/check_tools.sh new file mode 100755 index 00000000000..5b4d3107d3b --- /dev/null +++ b/egs/yomdle_korean/v1/local/check_tools.sh @@ -0,0 +1,43 @@ +#!/bin/bash -u + +# Copyright 2015 (c) Johns Hopkins University (Jan Trmal ) + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + +[ -f ./path.sh ] && . ./path.sh +set +e + +command -v python3 >&/dev/null \ + || { echo >&2 "python3 not found on PATH. You will have to install Python3, preferably >= 3.6"; exit 1; } + +python3 -c "import numpy" +if [ $? -ne 0 ] ; then + echo >&2 "This recipe needs numpy installed." + exit 1 +fi + +python3 -c "import scipy" +if [ $? -ne 0 ] ; then + echo >&2 "This recipe needs scipy installed." + exit 1 +fi + +python3 -c "import scipy.misc; scipy.misc.__dict__['imread']" +if [ $? -ne 0 ] ; then + echo >&2 "This recipe needs scipy-image and Pillow installed." + exit 1 +fi + + +exit 0 diff --git a/egs/yomdle_korean/v1/local/extract_features.sh b/egs/yomdle_korean/v1/local/extract_features.sh new file mode 100755 index 00000000000..3880ebad3e8 --- /dev/null +++ b/egs/yomdle_korean/v1/local/extract_features.sh @@ -0,0 +1,48 @@ +#!/bin/bash + +# Copyright 2017 Yiwen Shao +# 2018 Ashish Arora + +# Apache 2.0 +# This script runs the make features script in parallel. + +nj=4 +cmd=run.pl +feat_dim=40 +augment='no_aug' +fliplr=false +echo "$0 $@" + +. ./cmd.sh +. 
./path.sh +. ./utils/parse_options.sh || exit 1; + +data=$1 +featdir=$data/data +scp=$data/images.scp +logdir=$data/log + +mkdir -p $logdir +mkdir -p $featdir + +# make $featdir an absolute pathname +featdir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $featdir ${PWD}` + +for n in $(seq $nj); do + split_scps="$split_scps $logdir/images.$n.scp" +done + +# split images.scp +utils/split_scp.pl $scp $split_scps || exit 1; + +$cmd JOB=1:$nj $logdir/extract_features.JOB.log \ + image/ocr/make_features.py $logdir/images.JOB.scp \ + --allowed_len_file_path $data/allowed_lengths.txt \ + --feat-dim $feat_dim --fliplr $fliplr --augment_type $augment \| \ + copy-feats --compress=true --compression-method=7 \ + ark:- ark,scp:$featdir/images.JOB.ark,$featdir/images.JOB.scp + +## aggregates the output scp's to get feats.scp +for n in $(seq $nj); do + cat $featdir/images.$n.scp || exit 1; +done > $data/feats.scp || exit 1 diff --git a/egs/yomdle_korean/v1/local/normalize_data.py b/egs/yomdle_korean/v1/local/normalize_data.py new file mode 100755 index 00000000000..fba3e762789 --- /dev/null +++ b/egs/yomdle_korean/v1/local/normalize_data.py @@ -0,0 +1,18 @@ +#!/usr/bin/env python3 + +# Copyright 2017 Hossein Hadian + +# Apache 2.0 +# This script converts a BPE-encoded text to normal text. It is used in scoring + +import sys, io +import string +import unicodedata +infile = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8') +output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') +for line in infile: + words = line.strip().split() + uttid = words[0] + transcript = ' '.join(words[1:]) + text_normalized = unicodedata.normalize('NFC', transcript) + output.write(uttid + ' ' + text_normalized + '\n') diff --git a/egs/yomdle_korean/v1/local/prepare_dict.sh b/egs/yomdle_korean/v1/local/prepare_dict.sh new file mode 100755 index 00000000000..22db5ae834d --- /dev/null +++ b/egs/yomdle_korean/v1/local/prepare_dict.sh @@ -0,0 +1,26 @@ +#!/usr/bin/env bash + +# Copyright 2017 Hossein Hadian +# 2017 Babak Rekabdar +# 2017 Chun Chieh Chang +# 2017 Ashish Arora + +# This script prepares the dictionary. + +set -e +dir=data/local/dict +. ./utils/parse_options.sh || exit 1; + +mkdir -p $dir + +local/prepare_lexicon.py $dir + +cut -d' ' -f2- $dir/lexicon.txt | sed 's/SIL//g' | tr ' ' '\n' | sort -u | sed '/^$/d' >$dir/nonsilence_phones.txt || exit 1; + +echo ' SIL' >> $dir/lexicon.txt + +echo SIL > $dir/silence_phones.txt + +echo SIL >$dir/optional_silence.txt + +echo -n "" >$dir/extra_questions.txt diff --git a/egs/yomdle_korean/v1/local/prepare_lexicon.py b/egs/yomdle_korean/v1/local/prepare_lexicon.py new file mode 100755 index 00000000000..ec8d43d8335 --- /dev/null +++ b/egs/yomdle_korean/v1/local/prepare_lexicon.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python3 + +# Copyright 2017 Babak Rekabdar +# 2017 Hossein Hadian +# 2017 Chun Chieh Chang +# 2017 Ashish Arora +# Apache 2.0 + +# This script prepares lexicon for BPE. It gets the set of all words that occur in data/train/text. +# Since this lexicon is based on BPE, it replaces '|' with silence. 
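+#
+# Illustrative example (plain ASCII stands in for the NFD-decomposed Hangul
+# jamo that real entries contain): the BPE word "ab|c" in data/train/text
+# would produce the lexicon.txt line
+#   ab|c a b SIL c
+# i.e. each character becomes a "phone", '|' maps to SIL and '#' is dropped.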
+ +import argparse +import os +import unicodedata +parser = argparse.ArgumentParser(description="""Creates the list of characters and words in lexicon""") +parser.add_argument('dir', type=str, help='output path') +args = parser.parse_args() + +### main ### +lex = {} +text_path = os.path.join('data', 'train', 'text') +with open(text_path, 'r', encoding='utf-8') as f: + for line in f: + line_vect = line.strip().split(' ') + for i in range(1, len(line_vect)): + char_normalized = unicodedata.normalize('NFD', line_vect[i]).replace('\n', '') + characters = list(char_normalized) + characters = " ".join([ 'SIL' if char == '|' else char for char in characters]) + characters = list(characters) + characters = "".join([ '' if char == '#' else char for char in characters]) + lex[line_vect[i]] = characters + +with open(os.path.join(args.dir, 'lexicon.txt'), 'w', encoding='utf-8') as fp: + for key in sorted(lex): + fp.write(key + " " + lex[key] + "\n") diff --git a/egs/yomdle_korean/v1/local/process_corpus.py b/egs/yomdle_korean/v1/local/process_corpus.py new file mode 100755 index 00000000000..b39030270b7 --- /dev/null +++ b/egs/yomdle_korean/v1/local/process_corpus.py @@ -0,0 +1,30 @@ +#!/usr/bin/env python3 +# Copyright 2018 Ashish Arora +# Apache 2.0 +# This script reads valid phones and removes the lines in the corpus +# which have any other phone. + +import os +import sys, io + +phone_file = os.path.join('data/local/text/cleaned/phones.txt') +infile = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8') +output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') +phone_dict = dict() +with open(phone_file, 'r', encoding='utf-8') as phone_fh: + for line in phone_fh: + line = line.strip().split()[0] + phone_dict[line] = line + +phone_dict[' '] = ' ' +corpus_text = list() +for line in infile: + text = line.strip() + skip_text = False + for phone in text: + if phone not in phone_dict.keys(): + skip_text = True + break + if not skip_text: + output.write(text+ '\n') + diff --git a/egs/yomdle_korean/v1/local/process_data.py b/egs/yomdle_korean/v1/local/process_data.py new file mode 100755 index 00000000000..d7546b0a803 --- /dev/null +++ b/egs/yomdle_korean/v1/local/process_data.py @@ -0,0 +1,65 @@ +#!/usr/bin/env python3 + +# Copyright 2018 Ashish Arora +# 2018 Chun Chieh Chang + +""" This script reads the extracted Tamil OCR (yomdle and slam) database files + and creates the following files (for the data subset selected via --dataset): + text, utt2spk, images.scp. + Eg. local/process_data.py data/download/ data/local/splits/train.txt data/train + + Eg. 
text file: english_phone_books_0001_1 To sum up, then, it would appear that + utt2spk file: english_phone_books_0001_0 english_phone_books_0001 + images.scp file: english_phone_books_0001_0 \ + data/download/truth_line_image/english_phone_books_0001_0.png +""" + +import argparse +import os +import sys +import csv +import itertools +import unicodedata +import re +import string +import unicodedata +parser = argparse.ArgumentParser(description="Creates text, utt2spk, and images.scp files") +parser.add_argument('database_path', type=str, help='Path to data') +parser.add_argument('data_split', type=str, help='Path to file that contain datasplits') +parser.add_argument('out_dir', type=str, help='directory to output files') +args = parser.parse_args() + +### main ### +print("Processing '{}' data...".format(args.out_dir)) + +text_file = os.path.join(args.out_dir, 'text') +text_fh = open(text_file, 'w', encoding='utf-8') +utt2spk_file = os.path.join(args.out_dir, 'utt2spk') +utt2spk_fh = open(utt2spk_file, 'w', encoding='utf-8') +image_file = os.path.join(args.out_dir, 'images.scp') +image_fh = open(image_file, 'w', encoding='utf-8') + +with open(args.data_split) as f: + for line in f: + line = line.strip() + image_id = line + image_filename = image_id + '.png' + image_filepath = os.path.join(args.database_path, 'truth_line_image', image_filename) + if not os.path.isfile (image_filepath): + print("File does not exist {}".format(image_filepath)) + continue + line_id = int(line.split('_')[-1]) + csv_filename = '_'.join(line.split('_')[:-1]) + '.csv' + csv_filepath = os.path.join(args.database_path, 'truth_csv', csv_filename) + csv_file = open(csv_filepath, 'r', encoding='utf-8') + for row in csv.reader(csv_file): + if row[1] == image_filename: + text = row[11] + text_vect = text.split() # this is to avoid non-utf-8 spaces + text = " ".join(text_vect) + #text_normalized = unicodedata.normalize('NFD', text).replace('\n', '') + if not text: + continue + text_fh.write(image_id + ' ' + text + '\n') + utt2spk_fh.write(image_id + ' ' + '_'.join(line.split('_')[:-1]) + '\n') + image_fh.write(image_id + ' ' + image_filepath + '\n') diff --git a/egs/yomdle_korean/v1/local/score.sh b/egs/yomdle_korean/v1/local/score.sh new file mode 100755 index 00000000000..31564d25326 --- /dev/null +++ b/egs/yomdle_korean/v1/local/score.sh @@ -0,0 +1,5 @@ +#!/bin/bash + + +steps/scoring/score_kaldi_wer.sh "$@" +steps/scoring/score_kaldi_cer.sh --stage 2 "$@" diff --git a/egs/yomdle_korean/v1/local/semisup/chain/run_cnn_chainali_semisupervised_1a.sh b/egs/yomdle_korean/v1/local/semisup/chain/run_cnn_chainali_semisupervised_1a.sh new file mode 100755 index 00000000000..654880fcf59 --- /dev/null +++ b/egs/yomdle_korean/v1/local/semisup/chain/run_cnn_chainali_semisupervised_1a.sh @@ -0,0 +1,327 @@ +#!/bin/bash + +# Copyright 2017 Vimal Manohar +# 2018 Ashish Arora +# Apache 2.0 +# This script is semi-supervised recipe with 25k line images of supervised data +# and 22k line images of unsupervised data with naive splitting. +# Based on "Semi-Supervised Training of Acoustic Models using Lattice-Free MMI", +# Vimal Manohar, Hossein Hadian, Daniel Povey, Sanjeev Khudanpur, ICASSP 2018 +# http://www.danielpovey.com/files/2018_icassp_semisupervised_mmi.pdf +# local/semisup/run_semisup.sh shows how to call this. + +# We use 3-gram LM trained on 5M lines of auxilary data. +# This script uses the same tree as that for the seed model. 
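+#
+# Rough flow of the stages below: lattices from decoding the unsupervised
+# line images with the seed chain system ($sup_chain_dir/decode_train_unsup)
+# act as supervision, best-path posteriors give per-frame derivative weights,
+# egs are generated for both the supervised and unsupervised sets, combined
+# with steps/nnet3/chain/multilingual/combine_egs.sh, and a new model is
+# trained on the combined egs.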
+# Unsupervised set: train_unsup (25k tamil line images) +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervised): 3,2 +# LM for decoding unsupervised data: 4gram +# Supervision: Naive split lattices +# output-0 and output-1 are for superivsed and unsupervised data respectively. + +# local/chain/compare_wer.sh exp/chain/cnn_e2eali_1b/ exp/semisup_100k/chain/tdnn_semisup_1a/ +# System cnn_e2eali_1b tdnn_semisup_1a +# WER 15.06 13.83 +# CER 3.15 2.83 +# Final train prob -0.0343 0.6103-0.0360 +# Final valid prob -0.0403 0.6054-0.0418 + +# steps/info/chain_dir_info.pl exp/semisup_100k/chain/tdnn_semisup_1a/ +# exp/semisup_100k/chain/tdnn_semisup_1a/: num-iters=58 nj=6..16 num-params=3.7M dim=40->456 combine=0.240->0.240 (over 1) + +# Normalize scoring +#WER = 10.4 +#CER = 2.9 + +set -u -e -o pipefail + +stage=0 # Start from -1 for supervised seed system training +train_stage=-100 +nj=30 +test_nj=30 + +# The following 3 options decide the output directory for semi-supervised +# chain system +# dir=${exp_root}/chain${chain_affix}/tdnn${tdnn_affix} +exp_root=exp/semisup_100k +chain_affix= # affix for chain dir +tdnn_affix=_semisup_1a # affix for semi-supervised chain system + +# Datasets-Expects supervised_set and unsupervised_set +supervised_set=train +unsupervised_set=train_unsup + +# Input seed system +sup_chain_dir=exp/chain/cnn_e2eali_1b # supervised chain system +sup_lat_dir=exp/chain/e2e_train_lats # Seed model options +sup_tree_dir=exp/chain/tree_e2e # tree directory for supervised chain system + +# Semi-supervised options +supervision_weights=1.0,1.0 # Weights for supervised, unsupervised data egs. + # Can be used to scale down the effect of unsupervised data + # by using a smaller scale for it e.g. 1.0,0.3 +lm_weights=3,2 # Weights on phone counts from supervised, unsupervised data for denominator FST creation + +sup_egs_dir= # Supply this to skip supervised egs creation +unsup_egs_dir= # Supply this to skip unsupervised egs creation +unsup_egs_opts= # Extra options to pass to unsupervised egs creation +# Neural network opts +xent_regularize=0.1 +tdnn_dim=450 +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +if [ -f ./path.sh ]; then . ./path.sh; fi +. ./utils/parse_options.sh + +lang_decode=data/lang +lang_rescore=data/lang_rescore_6g +dir=$exp_root/chain$chain_affix/tdnn$tdnn_affix +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=40 name=input + + conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn6 height-in=10 height-out=10 time-offsets=-4,0,4 $common3 + conv-relu-batchnorm-layer name=cnn7 height-in=10 height-out=10 time-offsets=-4,0,4 $common3 + relu-batchnorm-layer name=tdnn1 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts + relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $output_opts + + # We use separate outputs for supervised and unsupervised data + # so we can properly track the train and valid objectives. + output name=output-0 input=output.affine + output name=output-1 input=output.affine + output name=output-0-xent input=output-xent.log-softmax + output name=output-1-xent input=output-xent.log-softmax +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +# Get values for $model_left_context, $model_right_context +. $dir/configs/vars + +left_context=$model_left_context +right_context=$model_right_context + +egs_left_context=$(perl -e "print int($left_context + $frame_subsampling_factor / 2)") +egs_right_context=$(perl -e "print int($right_context + $frame_subsampling_factor / 2)") + +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_$supervised_set + frames_per_eg=$(cat $sup_chain_dir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. 
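+    # ('.nodelete' is a marker telling clean-up tooling not to remove these
+    # egs; they are expensive to generate and can be reused on a rerun via
+    # the --sup-egs-dir / --unsup-egs-dir options of this script.)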
+ + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$cmd" \ + --left-tolerance 3 --right-tolerance 3 \ + --left-context $egs_left_context --right-context $egs_right_context \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 1 \ + --frames-overlap-per-eg 0 --constrained false \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 2000000 \ + --cmvn-opts "$cmvn_opts" \ + --generate-egs-scp true \ + data/${supervised_set} $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsup_frames_per_eg=340,300,200,100 # Using a frames-per-eg of 150 for unsupervised data + # was found to be better than allowing smaller chunks + # (160,140,110,80) like for supervised system +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices when + # creating numerator supervision +lattice_prune_beam=6.0 # beam for pruning the lattices prior to getting egs + # for unsupervised data +tolerance=3 # frame-tolerance for chain training + +unsup_lat_dir=$sup_chain_dir/decode_$unsupervised_set +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_$unsupervised_set + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh \ + --cmd "$cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $egs_left_context --right-context $egs_right_context \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 2000000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --deriv-weights-scp $sup_chain_dir/best_path_$unsupervised_set/weights.scp \ + --generate-egs-scp true $unsup_egs_opts \ + data/$unsupervised_set $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/comb_egs +if [ $stage -le 14 ]; then + steps/nnet3/chain/multilingual/combine_egs.sh --cmd "$cmd" \ + --block-size 64 \ + --lang2weight $supervision_weights 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + # This is to skip stages of den-fst creation, which was already done. 
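+  # (Stages of steps/nnet3/chain/train.py below -4 are setup steps such as
+  # denominator-FST creation; they are skipped here because the den.fst for
+  # this directory already exists.)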
+ train_stage=-4 +fi + +chunk_width=340,300,200,100 +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --egs.chunk-width=$chunk_width \ + --cmd "$cmd" \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00001 \ + --chain.apply-deriv-weights=true \ + --chain.frame-subsampling-factor=$frame_subsampling_factor \ + --chain.alignment-subsampling-factor=1 \ + --chain.left-tolerance 3 \ + --chain.right-tolerance 3 \ + --chain.lm-opts="--ngram-order=2 --no-prune-ngram-order=1 --num-extra-lm-states=900" \ + --trainer.srand=0 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=32,16 \ + --trainer.optimization.momentum=0.0 \ + --trainer.frames-per-iter=2000000 \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs 5 \ + --trainer.optimization.num-jobs-initial 6 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --egs.opts="--frames-overlap-per-eg 0 --constrained false" \ + --cleanup.remove-egs false \ + --feat-dir data/$supervised_set \ + --tree-dir $sup_tree_dir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 $lang_decode $dir $dir/graph +fi + +if [ $stage -le 18 ]; then + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --beam 12 --frames-per-chunk 340 --nj $nj --cmd "$cmd" \ + $dir/graph data/test $dir/decode_test + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/test $dir/decode_test{,_rescored} || exit 1 +fi +exit 0; + diff --git a/egs/yomdle_korean/v1/local/semisup/chain/run_cnn_chainali_semisupervised_1b.sh b/egs/yomdle_korean/v1/local/semisup/chain/run_cnn_chainali_semisupervised_1b.sh new file mode 100755 index 00000000000..eb688151665 --- /dev/null +++ b/egs/yomdle_korean/v1/local/semisup/chain/run_cnn_chainali_semisupervised_1b.sh @@ -0,0 +1,325 @@ +#!/bin/bash + +# Copyright 2017 Vimal Manohar +# 2018 Ashish Arora +# Apache 2.0 +# This script is semi-supervised recipe with 25k line images of supervised data +# and 22k line images of unsupervised data with naive splitting. +# Based on "Semi-Supervised Training of Acoustic Models using Lattice-Free MMI", +# Vimal Manohar, Hossein Hadian, Daniel Povey, Sanjeev Khudanpur, ICASSP 2018 +# http://www.danielpovey.com/files/2018_icassp_semisupervised_mmi.pdf +# local/semisup/run_semisup.sh shows how to call this. + +# We use 3-gram LM trained on 5M lines of auxilary data. +# This script uses the same tree as that for the seed model. +# Unsupervised set: train_unsup (25k tamil line images) +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervised): 3,2 +# LM for decoding unsupervised data: 4gram +# Supervision: Naive split lattices +# output-0 and output-1 are for superivsed and unsupervised data respectively. 
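+#
+# Differences from _1a: wider TDNN layers (tdnn_dim=550 vs 450), dropout on
+# the CNN/TDNN layers following dropout_schedule='0,0@0.20,0.2@0.50,0', and
+# more training epochs (16 vs 5).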
+ +# local/chain/compare_wer.sh exp/semisup_100k/chain/tdnn_semisup_1b/ +# System tdnn_semisup_1b +# score_basic score_normalized +# WER 13.73 10.2 +# WER (rescored) 12.80 9.4 +# CER 2.78 2.8 +# CER (rescored) 2.57 2.7 +# Final train prob 0.6138-0.0337 +# Final valid prob 0.6115-0.0399 + +# steps/info/chain_dir_info.pl exp/semisup_100k/chain/tdnn_semisup_1b/ +# exp/semisup_100k/chain/tdnn_semisup_1b/: num-iters=46 nj=6..16 num-params=5.7M dim=40->456 combine=0.239->0.239 (over 1) + +set -u -e -o pipefail +stage=0 # Start from -1 for supervised seed system training +train_stage=-100 +nj=30 +test_nj=30 + +# The following 3 options decide the output directory for semi-supervised +# chain system +# dir=${exp_root}/chain${chain_affix}/tdnn${tdnn_affix} +exp_root=exp/semisup_100k +chain_affix= # affix for chain dir +tdnn_affix=_semisup_1b # affix for semi-supervised chain system + +# Datasets-Expects supervised_set and unsupervised_set +supervised_set=train +unsupervised_set=train_unsup + +# Input seed system +sup_chain_dir=exp/chain/cnn_e2eali_1b # supervised chain system +sup_lat_dir=exp/chain/e2e_train_lats # Seed model options +sup_tree_dir=exp/chain/tree_e2e # tree directory for supervised chain system + +# Semi-supervised options +supervision_weights=1.0,1.0 # Weights for supervised, unsupervised data egs. + # Can be used to scale down the effect of unsupervised data + # by using a smaller scale for it e.g. 1.0,0.3 +lm_weights=3,2 # Weights on phone counts from supervised, unsupervised data for denominator FST creation + +sup_egs_dir= # Supply this to skip supervised egs creation +unsup_egs_dir= # Supply this to skip unsupervised egs creation +unsup_egs_opts= # Extra options to pass to unsupervised egs creation +# Neural network opts +xent_regularize=0.1 +tdnn_dim=550 +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +if [ -f ./path.sh ]; then . ./path.sh; fi +. ./utils/parse_options.sh + +lang_decode=data/lang +lang_rescore=data/lang_rescore_6g +dropout_schedule='0,0@0.20,0.2@0.50,0' +dir=$exp_root/chain$chain_affix/tdnn$tdnn_affix +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=40 name=input + conv-relu-batchnorm-dropout-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-dropout-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-dropout-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-dropout-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-dropout-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + conv-relu-batchnorm-dropout-layer name=cnn6 height-in=10 height-out=10 time-offsets=-4,0,4 $common3 + conv-relu-batchnorm-dropout-layer name=cnn7 height-in=10 height-out=10 time-offsets=-4,0,4 $common3 + relu-batchnorm-dropout-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + relu-batchnorm-dropout-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + relu-batchnorm-dropout-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts + relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $output_opts + + # We use separate outputs for supervised and unsupervised data + # so we can properly track the train and valid objectives. + output name=output-0 input=output.affine + output name=output-1 input=output.affine + output name=output-0-xent input=output-xent.log-softmax + output name=output-1-xent input=output-xent.log-softmax +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +# Get values for $model_left_context, $model_right_context +. $dir/configs/vars + +left_context=$model_left_context +right_context=$model_right_context + +egs_left_context=$(perl -e "print int($left_context + $frame_subsampling_factor / 2)") +egs_right_context=$(perl -e "print int($right_context + $frame_subsampling_factor / 2)") + +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_$supervised_set + frames_per_eg=$(cat $sup_chain_dir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. 
+ + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$cmd" \ + --left-tolerance 3 --right-tolerance 3 \ + --left-context $egs_left_context --right-context $egs_right_context \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 1 \ + --frames-overlap-per-eg 0 --constrained false \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 2000000 \ + --cmvn-opts "$cmvn_opts" \ + --generate-egs-scp true \ + data/${supervised_set} $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsup_frames_per_eg=340,300,200,100 # Using a frames-per-eg of 150 for unsupervised data + # was found to be better than allowing smaller chunks + # (160,140,110,80) like for supervised system +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices when + # creating numerator supervision +lattice_prune_beam=6.0 # beam for pruning the lattices prior to getting egs + # for unsupervised data +tolerance=3 # frame-tolerance for chain training + +unsup_lat_dir=$sup_chain_dir/decode_$unsupervised_set +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_$unsupervised_set + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh \ + --cmd "$cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $egs_left_context --right-context $egs_right_context \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 2000000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --deriv-weights-scp $sup_chain_dir/best_path_$unsupervised_set/weights.scp \ + --generate-egs-scp true $unsup_egs_opts \ + data/$unsupervised_set $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/comb_egs +if [ $stage -le 14 ]; then + steps/nnet3/chain/multilingual/combine_egs.sh --cmd "$cmd" \ + --block-size 64 \ + --lang2weight $supervision_weights 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + # This is to skip stages of den-fst creation, which was already done. 
+ train_stage=-4 +fi + +chunk_width=340,300,200,100 +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --egs.chunk-width=$chunk_width \ + --cmd "$cmd" \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00001 \ + --chain.apply-deriv-weights=true \ + --chain.frame-subsampling-factor=$frame_subsampling_factor \ + --chain.alignment-subsampling-factor=1 \ + --chain.left-tolerance 3 \ + --chain.right-tolerance 3 \ + --chain.lm-opts="--ngram-order=2 --no-prune-ngram-order=1 --num-extra-lm-states=900" \ + --trainer.srand=0 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=32,16 \ + --trainer.optimization.momentum=0.0 \ + --trainer.frames-per-iter=2000000 \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs 16 \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.optimization.num-jobs-initial 6 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --egs.opts="--frames-overlap-per-eg 0 --constrained false" \ + --cleanup.remove-egs false \ + --feat-dir data/$supervised_set \ + --tree-dir $sup_tree_dir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 $lang_decode $dir $dir/graph +fi + +if [ $stage -le 18 ]; then + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --beam 12 --frames-per-chunk 340 --nj $nj --cmd "$cmd" \ + $dir/graph data/test $dir/decode_test + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/test $dir/decode_test{,_rescored} || exit 1 +fi +exit 0; + diff --git a/egs/yomdle_korean/v1/local/semisup/process_data.py b/egs/yomdle_korean/v1/local/semisup/process_data.py new file mode 100755 index 00000000000..94ad770ec2d --- /dev/null +++ b/egs/yomdle_korean/v1/local/semisup/process_data.py @@ -0,0 +1,61 @@ +#!/usr/bin/env python3 + +# Copyright 2018 Ashish Arora +# 2018 Chun Chieh Chang + +""" This script reads the slam boxed Tamil OCR dataset and creates the following + files utt2spk, images.scp. Since boxed data do not have transcripts, it do not + creates text file. It is created as a separate script, because the data that + local/process_data.py is processing contains some empty transcripts which + should be removed or it will create bug while applying BPE. + + Eg. local/semisup/process_data.py data/download/ data/local/splits/train_unsup.txt + data/train_unsup + + Eg. 
utt2spk file: english_phone_books_0001_0 english_phone_books_0001 + images.scp file: english_phone_books_0001_0 \ + data/download/truth_line_image/english_phone_books_0001_0.png +""" +import argparse +import os +import sys +import csv +import itertools +import unicodedata +import re +import string +parser = argparse.ArgumentParser(description="Creates text, utt2spk, and images.scp files") +parser.add_argument('database_path', type=str, help='Path to data') +parser.add_argument('data_split', type=str, help='Path to file that contain datasplits') +parser.add_argument('out_dir', type=str, help='directory to output files') +args = parser.parse_args() + +### main ### +print("Processing '{}' data...".format(args.out_dir)) + +utt2spk_file = os.path.join(args.out_dir, 'utt2spk') +utt2spk_fh = open(utt2spk_file, 'w', encoding='utf-8') +image_file = os.path.join(args.out_dir, 'images.scp') +image_fh = open(image_file, 'w', encoding='utf-8') +text_file = os.path.join(args.out_dir, 'text') +text_fh = open(text_file, 'w', encoding='utf-8') + +with open(args.data_split) as f: + for line in f: + line = line.strip() + image_id = line + image_filename = image_id + '.png' + image_filepath = os.path.join(args.database_path, 'truth_line_image', image_filename) + if not os.path.isfile (image_filepath): + print("File does not exist {}".format(image_filepath)) + continue + line_id = int(line.split('_')[-1]) + csv_filename = '_'.join(line.split('_')[:-1]) + '.csv' + csv_filepath = os.path.join(args.database_path, 'truth_csv', csv_filename) + csv_file = open(csv_filepath, 'r', encoding='utf-8') + for row in csv.reader(csv_file): + if row[1] == image_filename: + text = 'semisup' + text_fh.write(image_id + ' ' + text + '\n') + utt2spk_fh.write(image_id + ' ' + '_'.join(line.split('_')[:-1]) + '\n') + image_fh.write(image_id + ' ' + image_filepath + '\n') diff --git a/egs/yomdle_korean/v1/local/semisup/run_semisup.sh b/egs/yomdle_korean/v1/local/semisup/run_semisup.sh new file mode 100755 index 00000000000..5e20f50c99e --- /dev/null +++ b/egs/yomdle_korean/v1/local/semisup/run_semisup.sh @@ -0,0 +1,71 @@ +#!/bin/bash + +# Copyright 2017 Vimal Manohar +# 2018 Ashish Arora +# Apache 2.0 + +# This script demonstrates semi-supervised training using 25k line images of +# supervised data and 22k line images of unsupervised data. +# We assume the supervised data is in data/train and unsupervised data +# is in data/train_unsup. +# For LM training, we use 5 million lines of tamil text. + +set -e +set -o pipefail +stage=0 +nj=30 +exp_root=exp/semisup_56k +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +mkdir -p data/train_unsup/data +if [ $stage -le 0 ]; then + echo "stage 0: Processing train unsupervised data...$(date)" + local/semisup/process_data.py data/download/ \ + data/local/splits/train_unsup.txt \ + data/train_unsup + image/fix_data_dir.sh data/train_unsup +fi + +if [ $stage -le 1 ]; then + echo "stage 1: Obtaining image groups. calling get_image2num_frames..." + image/get_image2num_frames.py --feat-dim 40 data/train_unsup + image/get_allowed_lengths.py --frame-subsampling-factor 4 10 data/train_unsup + echo "Extracting features and calling compute_cmvn_stats: $(date) " + local/extract_features.sh --nj $nj --cmd "$cmd" --feat-dim 40 data/train_unsup + steps/compute_cmvn_stats.sh data/train_unsup || exit 1; + image/fix_data_dir.sh data/train_unsup +fi + +for f in data/train/utt2spk data/train_unsup/utt2spk \ + data/train/text; do + if [ ! 
-f $f ]; then + echo "$0: Could not find $f" + exit 1; + fi +done + +# Prepare semi-supervised train set +if [ $stage -le 1 ]; then + utils/combine_data.sh data/semisup100k_250k \ + data/train data/train_unsup || exit 1 +fi + +############################################################################### +# Semi-supervised training using 25k line images supervised data and +# 22k hours unsupervised data. We use tree, lattices +# and seed chain system from the previous stage. +############################################################################### +if [ $stage -le 2 ]; then + local/semisup/chain/run_cnn_chainali_semisupervised_1b.sh \ + --supervised-set train \ + --unsupervised-set train_unsup \ + --sup-chain-dir exp/chain/cnn_e2eali_1b_ep16_7cnn \ + --sup-lat-dir exp/chain/e2e_train_lats \ + --sup-tree-dir exp/chain/tree_e2e \ + --chain-affix "" \ + --tdnn-affix _semisup_ep16_7cnn \ + --stage 15 --train_stage 9 \ + --exp-root $exp_root || exit 1 +fi diff --git a/egs/yomdle_korean/v1/local/train_lm.sh b/egs/yomdle_korean/v1/local/train_lm.sh new file mode 100755 index 00000000000..c73c42fb7dc --- /dev/null +++ b/egs/yomdle_korean/v1/local/train_lm.sh @@ -0,0 +1,127 @@ +#!/bin/bash + +# Copyright 2016 Vincent Nguyen +# 2016 Johns Hopkins University (author: Daniel Povey) +# 2017 Ashish Arora +# 2017 Hossein Hadian +# Apache 2.0 +# +# This script trains a LM on the training transcriptions and corpus text. +# It is based on the example scripts distributed with PocoLM + +# It will check if pocolm is installed and if not will proceed with installation + +set -e +stage=0 +dir=data/local/local_lm +order=6 +echo "$0 $@" # Print the command line for logging +. ./utils/parse_options.sh || exit 1; + +lm_dir=${dir}/data + + +mkdir -p $dir +. ./path.sh || exit 1; # for KALDI_ROOT +export PATH=$KALDI_ROOT/tools/pocolm/scripts:$PATH +( # First make sure the pocolm toolkit is installed. + cd $KALDI_ROOT/tools || exit 1; + if [ -d pocolm ]; then + echo Not installing the pocolm toolkit since it is already there. + else + echo "$0: Please install the PocoLM toolkit with: " + echo " cd ../../../tools; extras/install_pocolm.sh; cd -" + exit 1; + fi +) || exit 1; + +bypass_metaparam_optim_opt= +# If you want to bypass the metaparameter optimization steps with specific metaparameters +# un-comment the following line, and change the numbers to some appropriate values. +# You can find the values from output log of train_lm.py. +# These example numbers of metaparameters is for 4-gram model (with min-counts) +# running with train_lm.py. +# The dev perplexity should be close to the non-bypassed model. +#bypass_metaparam_optim_opt="--bypass-metaparameter-optimization=0.031,0.860,0.678,0.194,0.037,0.006,0.928,0.712,0.454,0.220,0.926,0.844,0.749,0.358,0.966,0.879,0.783,0.544,0.966,0.826,0.674,0.450" +# Note: to use these example parameters, you may need to remove the .done files +# to make sure the make_lm_dir.py be called and tain only 3-gram model +#for order in 3; do +#rm -f ${lm_dir}/${num_word}_${order}.pocolm/.done + +if [ $stage -le 0 ]; then + mkdir -p ${dir}/data + mkdir -p ${dir}/data/text + + echo "$0: Getting the Data sources" + + rm ${dir}/data/text/* 2>/dev/null || true + + # use the validation data as the dev set. + # Note: the name 'dev' is treated specially by pocolm, it automatically + # becomes the dev set. + + cat data/local/text/cleaned/bpe_val.txt > ${dir}/data/text/dev.txt + # use the training data as an additional data source. + # we can later fold the dev data into this. 
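+  # (Each line of data/train/text is "<utt-id> <transcript>"; the cut below
+  # drops the utterance id so that only the transcripts feed the LM.)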
+ cat data/train/text | cut -d " " -f 2- > ${dir}/data/text/train.txt + cat data/local/text/cleaned/bpe_corpus.txt > ${dir}/data/text/corpus_text.txt + # for reporting perplexities, we'll use the "real" dev set. + # (the validation data is used as ${dir}/data/text/dev.txt to work + # out interpolation weights.) + # note, we can't put it in ${dir}/data/text/, because then pocolm would use + # it as one of the data sources. + cut -d " " -f 2- < data/test/text > ${dir}/data/real_dev_set.txt + + # get the wordlist from train and corpus text + cat ${dir}/data/text/{train,corpus_text}.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count + cat ${dir}/data/word_count | awk '{print $2}' > ${dir}/data/wordlist +fi + +if [ $stage -le 1 ]; then + # decide on the vocabulary. + # Note: you'd use --wordlist if you had a previously determined word-list + # that you wanted to use. + # Note: if you have more than one order, use a certain amount of words as the + # vocab and want to restrict max memory for 'sort', + echo "$0: training the unpruned LM" + min_counts='train=1' + wordlist=${dir}/data/wordlist + + lm_name="`basename ${wordlist}`_${order}" + if [ -n "${min_counts}" ]; then + lm_name+="_`echo ${min_counts} | tr -s "[:blank:]" "_" | tr "=" "-"`" + fi + unpruned_lm_dir=${lm_dir}/${lm_name}.pocolm + train_lm.py --wordlist=${wordlist} --num-splits=20 --warm-start-ratio=20 \ + --limit-unk-history=true \ + ${bypass_metaparam_optim_opt} \ + ${dir}/data/text ${order} ${lm_dir}/work ${unpruned_lm_dir} + + get_data_prob.py ${dir}/data/real_dev_set.txt ${unpruned_lm_dir} 2>&1 | grep -F '[perplexity' + mkdir -p ${dir}/data/arpa + format_arpa_lm.py ${unpruned_lm_dir} | gzip -c > ${dir}/data/arpa/${order}gram_unpruned.arpa.gz +fi + +if [ $stage -le 2 ]; then + echo "$0: pruning the LM (to larger size)" + # Using 10 million n-grams for a big LM for rescoring purposes. + size=10000000 + prune_lm_dir.py --target-num-ngrams=$size --initial-threshold=0.02 ${unpruned_lm_dir} ${dir}/data/lm_${order}_prune_big + + get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_big 2>&1 | grep -F '[perplexity' + #[perplexity = 22.0613098868] over 151116.0 words + mkdir -p ${dir}/data/arpa + format_arpa_lm.py ${dir}/data/lm_${order}_prune_big | gzip -c > ${dir}/data/arpa/${order}gram_big.arpa.gz +fi + +if [ $stage -le 3 ]; then + echo "$0: pruning the LM (to smaller size)" + # Using 2 million n-grams for a smaller LM for graph building. Prune from the + # bigger-pruned LM, it'll be faster. + size=2000000 + prune_lm_dir.py --target-num-ngrams=$size ${dir}/data/lm_${order}_prune_big ${dir}/data/lm_${order}_prune_small + + get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_small 2>&1 | grep -F '[perplexity' + #[perplexity = 23.4801171202] over 151116.0 words + format_arpa_lm.py ${dir}/data/lm_${order}_prune_small | gzip -c > ${dir}/data/arpa/${order}gram_small.arpa.gz +fi diff --git a/egs/yomdle_korean/v1/local/wer_output_filter b/egs/yomdle_korean/v1/local/wer_output_filter new file mode 100755 index 00000000000..59e364e0231 --- /dev/null +++ b/egs/yomdle_korean/v1/local/wer_output_filter @@ -0,0 +1,17 @@ +#!/usr/bin/env python3 + +# Copyright 2017 Hossein Hadian + +# Apache 2.0 +# This script converts a BPE-encoded text to normal text. 
It is used in scoring + +import sys, io +import string +infile = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8') +output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') +for line in infile: + words = line.strip().split() + uttid = words[0] + transcript = ''.join(words[1:]) + transcript = transcript.replace('|', ' ') + output.write(uttid + ' ' + transcript + '\n') diff --git a/egs/yomdle_korean/v1/local/yomdle b/egs/yomdle_korean/v1/local/yomdle new file mode 120000 index 00000000000..2c4544c1399 --- /dev/null +++ b/egs/yomdle_korean/v1/local/yomdle @@ -0,0 +1 @@ +../../../yomdle_tamil/v1/local/yomdle/ \ No newline at end of file diff --git a/egs/yomdle_korean/v1/path.sh b/egs/yomdle_korean/v1/path.sh new file mode 100755 index 00000000000..2d17b17a84a --- /dev/null +++ b/egs/yomdle_korean/v1/path.sh @@ -0,0 +1,6 @@ +export KALDI_ROOT=`pwd`/../../.. +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C diff --git a/egs/yomdle_korean/v1/run_end2end.sh b/egs/yomdle_korean/v1/run_end2end.sh new file mode 100755 index 00000000000..65f5beb4b08 --- /dev/null +++ b/egs/yomdle_korean/v1/run_end2end.sh @@ -0,0 +1,186 @@ +#!/bin/bash + +# Copyright 2018 Hossein Hadian +# Ashish Arora +# Jonathan Chang +# Apache 2.0 + +set -e +stage=0 +nj=30 + +language_main=Korean +slam_dir=/export/corpora5/slam/SLAM/ +yomdle_dir=/export/corpora5/slam/YOMDLE/ +corpus_dir=/export/corpora5/handwriting_ocr/corpus_data/ko/ +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +./local/check_tools.sh +# Start from stage=-2 for data preparation. This stage stores line images, +# csv files and splits{train,test,train_unsup} data/download/truth_line_image, +# data/download/truth_csv and data/local/splits respectively. +if [ $stage -le -2 ]; then + echo "$(date): preparing data, obtaining line images and csv files..." + local/yomdle/create_download_dir.sh --language_main $language_main \ + --slam_dir $slam_dir --yomdle_dir $yomdle_dir +fi + +if [ $stage -le -1 ]; then + echo "$(date): getting corpus text for language modelling..." + mkdir -p data/local/text/cleaned + cat $corpus_dir/* > data/local/text/ko.txt + head -20000 data/local/text/ko.txt > data/local/text/cleaned/val.txt + tail -n +20000 data/local/text/ko.txt > data/local/text/cleaned/corpus.txt +fi + +mkdir -p data/{train,test}/data +if [ $stage -le 0 ]; then + echo "$0 stage 0: Processing train and test data.$(date)" + echo " creating text, images.scp, utt2spk and spk2utt" + #local/prepare_data.sh data/download/ + for set in train test; do + local/process_data.py data/download/ \ + data/local/splits/${set}.txt data/${set} + image/fix_data_dir.sh data/${set} + done +fi + +if [ $stage -le 1 ]; then + echo "$(date) stage 1: getting allowed image widths for e2e training..." + image/get_image2num_frames.py --feat-dim 40 data/train + image/get_allowed_lengths.py --frame-subsampling-factor 4 10 data/train + for set in train test; do + echo "$(date) Extracting features, creating feats.scp file" + local/extract_features.sh --nj $nj --cmd "$cmd" --feat-dim 40 data/${set} + steps/compute_cmvn_stats.sh data/${set} || exit 1; + done + image/fix_data_dir.sh data/train +fi + +if [ $stage -le 3 ]; then + echo "$(date) stage 3: BPE preparation" + # getting non-silence phones. 
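+  # (the inline python3 block below prints every distinct character seen in the
+  # training transcriptions exactly once; the resulting phones.txt is then
+  # concatenated with the training text before learn_bpe.py, so that every
+  # symbol occurs at least once when the BPE units are learned.)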
+ cut -d' ' -f2- data/train/text | \ +python3 <( +cat << "END" +import os, sys, io; +infile = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8'); +output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8'); +phone_dict = dict(); +for line in infile: + line_vect = line.strip().split(); + for word in line_vect: + for phone in word: + phone_dict[phone] = phone; + +for phone in phone_dict.keys(): + output.write(phone+ '\n'); +END + ) > data/local/text/cleaned/phones.txt + + cut -d' ' -f2- data/train/text > data/local/text/cleaned/train.txt + + echo "learning BPE..." + # it is currently learned with only training text but we can also use all corpus text + # to learn BPE. phones are added so that one isolated occurance of every phone exists. + cat data/local/text/cleaned/phones.txt data/local/text/cleaned/train.txt | \ + utils/lang/bpe/prepend_words.py | utils/lang/bpe/learn_bpe.py -s 700 > data/local/bpe.txt || exit 1; +fi + +if [ $stage -le 4 ]; then + echo "$(date) stage 4: applying BPE..." + echo "applying BPE on train, test text..." + for set in test train; do + cut -d' ' -f1 data/$set/text > data/$set/ids + cut -d' ' -f2- data/$set/text | utils/lang/bpe/prepend_words.py | \ + utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt | \ + sed 's/@@//g' > data/$set/bpe_text + mv data/$set/text data/$set/text.old + paste -d' ' data/$set/ids data/$set/bpe_text > data/$set/text + rm -f data/$set/bpe_text data/$set/ids + done + + echo "applying BPE to corpus text..." + cat data/local/text/cleaned/corpus.txt | utils/lang/bpe/prepend_words.py | \ + utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt | \ + sed 's/@@//g' > data/local/text/cleaned/bpe_corpus.txt + cat data/local/text/cleaned/val.txt | utils/lang/bpe/prepend_words.py | \ + utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt | \ + sed 's/@@//g' > data/local/text/cleaned/bpe_val.txt +fi + +if [ $stage -le 5 ]; then + echo "$(date) stage 5: Preparing dictionary and lang..." + local/prepare_dict.sh --dir data/local/dict + utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 4 --sil-prob 0.0 --position-dependent-phones false \ + data/local/dict "" data/lang/temp data/lang + utils/lang/bpe/add_final_optional_silence.sh --final-sil-prob 0.5 data/lang +fi + +if [ $stage -le 6 ]; then + echo "$(date) stage 6: Calling the flat-start chain recipe..." + local/chain/run_e2e_cnn.sh +fi + +if [ $stage -le 7 ]; then + echo "$(date) stage 7: Aligning the training data using the e2e chain model..." + steps/nnet3/align.sh --nj $nj --cmd "$cmd" \ + --scale-opts '--transition-scale=1.0 --acoustic-scale=1.0 --self-loop-scale=1.0' \ + data/train data/lang exp/chain/e2e_cnn_1a exp/chain/e2e_ali_train +fi + +chunk_width='340,300,200,100' +lang_decode=data/lang +lang_rescore=data/lang_rescore_6g +if [ $stage -le 8 ]; then + echo "$(date) stage 8: Building a tree and training a regular chain model using the e2e alignments..." + local/chain/run_cnn_e2eali.sh --chunk_width $chunk_width +fi + +if [ $stage -le 9 ]; then + echo "$(date) stage 9: Estimating a language model for decoding..." + local/train_lm.sh + utils/format_lm.sh data/lang data/local/local_lm/data/arpa/6gram_small.arpa.gz \ + data/local/dict/lexicon.txt data/lang + utils/build_const_arpa_lm.sh data/local/local_lm/data/arpa/6gram_unpruned.arpa.gz \ + data/lang data/lang_rescore_6g +fi + +if [ $stage -le 10 ] && $decode_e2e; then + echo "$(date) stage 10: decoding end2end setup..." 
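+  # (the decode commands below use --acwt 1.0 with --post-decode-acwt 10.0, the
+  # usual convention for chain models, so that scoring can sweep the normal
+  # range of language-model weights.)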
+ + utils/mkgraph.sh \ + --self-loop-scale 1.0 $lang_decode \ + exp/chain/e2e_cnn_1a/ exp/chain/e2e_cnn_1a/graph || exit 1; + + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 30 --cmd "$cmd" --beam 12 \ + exp/chain/e2e_cnn_1a/graph data/test exp/chain/e2e_cnn_1a/decode_test || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/test exp/chain/e2e_cnn_1a/decode_test{,_rescored} || exit 1 + + echo "Done. Date: $(date). Results:" + local/chain/compare_wer.sh exp/chain/e2e_cnn_1a/ +fi + +if [ $stage -le 11 ] && $decode_chain; then + echo "$(date) stage 11: decoding chain alignment setup..." + + utils/mkgraph.sh \ + --self-loop-scale 1.0 $lang_decode \ + exp/chain/cnn_e2eali_1a/ exp/chain/cnn_e2eali_1a/graph || exit 1; + + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 30 --cmd "$cmd" --beam 12 \ + exp/chain/cnn_e2eali_1a/graph data/test exp/chain/cnn_e2eali_1a/decode_test || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/test exp/chain/cnn_e2eali_1a/decode_test{,_rescored} || exit 1 + + echo "Done. Date: $(date). Results:" + local/chain/compare_wer.sh exp/chain/cnn_e2eali_1a +fi diff --git a/egs/yomdle_korean/v1/steps b/egs/yomdle_korean/v1/steps new file mode 120000 index 00000000000..1b186770dd1 --- /dev/null +++ b/egs/yomdle_korean/v1/steps @@ -0,0 +1 @@ +../../wsj/s5/steps/ \ No newline at end of file diff --git a/egs/yomdle_korean/v1/utils b/egs/yomdle_korean/v1/utils new file mode 120000 index 00000000000..a3279dc8679 --- /dev/null +++ b/egs/yomdle_korean/v1/utils @@ -0,0 +1 @@ +../../wsj/s5/utils/ \ No newline at end of file diff --git a/egs/yomdle_tamil/README.txt b/egs/yomdle_tamil/README.txt new file mode 100644 index 00000000000..0f295e5ae5f --- /dev/null +++ b/egs/yomdle_tamil/README.txt @@ -0,0 +1,3 @@ +This directory contains example scripts for OCR on the Yomdle and Slam datasets. +Training is done on the Yomdle dataset and testing is done on Slam. +LM rescoring is also done with extra corpus data obtained from various sources. 
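The yomdle recipes above train and decode on BPE sub-word units (stages 3-4 build and apply the encoding), and local/wer_output_filter shown earlier undoes that encoding before WER is computed. The short Python sketch below mirrors the filter's logic; the input line is made up, and it assumes, as the filter implies, that utils/lang/bpe/prepend_words.py marks each original word with a leading '|'.

    #!/usr/bin/env python3
    # Sketch of the BPE-to-plain-text mapping done at scoring time.
    # Mirrors local/wer_output_filter; the example utterance is invented.

    def bpe_to_plain(line):
        words = line.strip().split()
        uttid, pieces = words[0], words[1:]
        # join the sub-word pieces, then turn the '|' word markers into spaces
        return uttid + ' ' + ''.join(pieces).replace('|', ' ')

    print(bpe_to_plain('utt_0001 |he llo |wor ld'))
    # prints 'utt_0001  hello world'; the extra space before "hello" is
    # harmless because scoring splits the transcript on whitespace.

Note that the sed 's/@@//g' in stage 4 already removes the BPE continuation markers before the units enter the lexicon, so the filter only needs to join the pieces and restore the '|' word markers.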
diff --git a/egs/yomdle_tamil/v1/local/yomdle/create_line_image_from_page_image.py b/egs/yomdle_tamil/v1/local/yomdle/create_line_image_from_page_image.py index 8f03be874e7..885f18c7deb 100755 --- a/egs/yomdle_tamil/v1/local/yomdle/create_line_image_from_page_image.py +++ b/egs/yomdle_tamil/v1/local/yomdle/create_line_image_from_page_image.py @@ -29,7 +29,8 @@ from scipy.spatial import ConvexHull from PIL import Image from scipy.misc import toimage - +from pathlib import Path +from glob import glob parser = argparse.ArgumentParser(description="Creates line images from page image") parser.add_argument('image_dir', type=str, help='Path to full page images') parser.add_argument('csv_dir', type=str, help='Path to csv files') @@ -115,7 +116,7 @@ def bounding_area(index, hull): return {'area': len_p * len_o, 'length_parallel': len_p, 'length_orthogonal': len_o, - 'rectangle_center': (min_p + len_p / 2, min_o + len_o / 2), + 'rectangle_center': (min_p + float(len_p) / 2, min_o + float(len_o) / 2), 'unit_vector': unit_vector_p, } @@ -220,8 +221,8 @@ def get_center(im): ------- (int, int): center of the image """ - center_x = im.size[0] / 2 - center_y = im.size[1] / 2 + center_x = float(im.size[0]) / 2 + center_y = float(im.size[1]) / 2 return int(center_x), int(center_y) @@ -321,10 +322,18 @@ def update_minimum_bounding_box_input(bounding_box_input): ### main ### globvar = 0 text_fh = open(args.output_file, 'w', encoding='utf-8') -for filename in sorted(os.listdir(args.csv_dir)): - with open(os.path.join(args.csv_dir, filename), 'r', encoding='utf-8') as f: - image_file = os.path.join(args.image_dir, filename.split('.')[0] + args.ext) - im = Image.open(image_file).convert('L') +file_list = list(Path(args.csv_dir).rglob("*.[cC][sS][vV]")) +for filename in sorted(file_list): + filename = str(filename) + with open(str(filename), 'r', encoding='utf-8') as f: + base_name = os.path.basename(filename) + image_file = os.path.join(args.image_dir, base_name.split('.')[0] + args.ext) + try: + im = Image.open(image_file).convert('L') + except Exception as e: + print("Error: No such Image " + row[1]) + globvar += 1 + continue im = pad_image(im) for row in itertools.islice(csv.reader(f), 1, None): points = [] diff --git a/egs/yomdle_tamil/v1/local/yomdle/gedi2csv_enriched.py b/egs/yomdle_tamil/v1/local/yomdle/gedi2csv_enriched.py index 1c9ab618a78..51d7a34e7e8 100755 --- a/egs/yomdle_tamil/v1/local/yomdle/gedi2csv_enriched.py +++ b/egs/yomdle_tamil/v1/local/yomdle/gedi2csv_enriched.py @@ -39,7 +39,7 @@ def npbox2string(npar): # cv2.minAreaRect() returns a Box2D structure which contains following detals - ( center (x,y), (width, height), angle of rotation ) # Get 4 corners of the rectangle using cv2.boxPoints() -class GEDI2CSV(): +class GEDI2CSV(object): ''' Initialize the extractor''' def __init__(self, logger, args): self._logger = logger diff --git a/egs/yomdle_tamil/v1/local/yomdle/yomdle2csv.py b/egs/yomdle_tamil/v1/local/yomdle/yomdle2csv.py index 49fc41aa5cc..d75b8bcbe8b 100755 --- a/egs/yomdle_tamil/v1/local/yomdle/yomdle2csv.py +++ b/egs/yomdle_tamil/v1/local/yomdle/yomdle2csv.py @@ -43,7 +43,7 @@ def npbox2string(npar): # cv2.minAreaRect() returns a Box2D structure which contains following detals - ( center (x,y), (width, height), angle of rotation ) # Get 4 corners of the rectangle using cv2.boxPoints() -class GEDI2CSV(): +class GEDI2CSV(object): ''' Initialize the extractor''' def __init__(self, logger, args): diff --git a/egs/yomdle_zh/v1/local/create_line_image_from_page_image.py 
b/egs/yomdle_zh/v1/local/create_line_image_from_page_image.py index 77a6791d5d7..7135bb1b242 100755 --- a/egs/yomdle_zh/v1/local/create_line_image_from_page_image.py +++ b/egs/yomdle_zh/v1/local/create_line_image_from_page_image.py @@ -110,7 +110,7 @@ def bounding_area(index, hull): return {'area': len_p * len_o, 'length_parallel': len_p, 'length_orthogonal': len_o, - 'rectangle_center': (min_p + len_p / 2, min_o + len_o / 2), + 'rectangle_center': (min_p + float(len_p) / 2, min_o + float(len_o) / 2), 'unit_vector': unit_vector_p, } @@ -275,8 +275,8 @@ def get_center(im): ------- (int, int): center of the image """ - center_x = im.size[0] / 2 - center_y = im.size[1] / 2 + center_x = float(im.size[0]) / 2 + center_y = float(im.size[1]) / 2 return int(center_x), int(center_y) diff --git a/egs/yomdle_zh/v1/local/gedi2csv.py b/egs/yomdle_zh/v1/local/gedi2csv.py index 43a07421dd1..0b80c2e80bb 100755 --- a/egs/yomdle_zh/v1/local/gedi2csv.py +++ b/egs/yomdle_zh/v1/local/gedi2csv.py @@ -55,7 +55,7 @@ def npbox2string(npar): # cv2.minAreaRect() returns a Box2D structure which contains following detals - ( center (x,y), (width, height), angle of rotation ) # Get 4 corners of the rectangle using cv2.boxPoints() -class GEDI2CSV(): +class GEDI2CSV(object): """ Initialize the extractor""" def __init__(self, logger, args): diff --git a/egs/yomdle_zh/v1/local/yomdle2csv.py b/egs/yomdle_zh/v1/local/yomdle2csv.py index 3641de90324..8f208e2d968 100755 --- a/egs/yomdle_zh/v1/local/yomdle2csv.py +++ b/egs/yomdle_zh/v1/local/yomdle2csv.py @@ -55,7 +55,7 @@ def npbox2string(npar): # cv2.minAreaRect() returns a Box2D structure which contains following detals - ( center (x,y), (width, height), angle of rotation ) # Get 4 corners of the rectangle using cv2.boxPoints() -class GEDI2CSV(): +class GEDI2CSV(object): """ Initialize the extractor""" def __init__(self, logger, args): diff --git a/misc/maintenance/cpplint.py b/misc/maintenance/cpplint.py index 03d0569ab1c..91658705f41 100755 --- a/misc/maintenance/cpplint.py +++ b/misc/maintenance/cpplint.py @@ -83,6 +83,7 @@ We do a small hack, which is to ignore //'s with "'s after them on the same line, but it is far from perfect (in either direction). """ +from __future__ import division import codecs import getopt @@ -564,7 +565,7 @@ def IncrementErrorCount(self, category): def PrintErrorCounts(self): """Print a summary of errors by category, and the total.""" - for category, count in self.errors_by_category.iteritems(): + for category, count in self.errors_by_category.items(): sys.stderr.write('Category \'%s\' errors found: %d\n' % (category, count)) sys.stderr.write('Total errors found: %d\n' % self.error_count) @@ -656,7 +657,7 @@ def Check(self, error, filename, linenum): trigger = base_trigger * 2**_VerboseLevel() if self.lines_in_function > trigger: - error_level = int(math.log(self.lines_in_function / base_trigger, 2)) + error_level = int(math.log(float(self.lines_in_function) / base_trigger, 2)) # 50 => 0, 100 => 1, 200 => 2, 400 => 3, 800 => 4, 1600 => 5, ... if error_level > 5: error_level = 5 @@ -676,7 +677,7 @@ class _IncludeError(Exception): pass -class FileInfo: +class FileInfo(object): """Provides utility functions for filenames. FileInfo provides easy access to the components of a file's path @@ -1012,7 +1013,7 @@ def CheckForCopyright(filename, lines, error): # We'll say it should occur by line 10. Don't forget there's a # dummy line at the front. 
- for line in xrange(1, min(len(lines), 11)): + for line in range(1, min(len(lines), 11)): if re.search(r'Copyright', lines[line], re.I): break else: # means no copyright line was found error(filename, 0, 'legal/copyright', 5, @@ -1604,7 +1605,7 @@ def CheckForFunctionLengths(filename, clean_lines, linenum, if starting_func: body_found = False - for start_linenum in xrange(linenum, clean_lines.NumLines()): + for start_linenum in range(linenum, clean_lines.NumLines()): start_line = lines[start_linenum] joined_line += ' ' + start_line.lstrip() if Search(r'(;|})', start_line): # Declarations and trivial functions @@ -2073,7 +2074,7 @@ def GetLineWidth(line): The width of the line in column positions, accounting for Unicode combining characters and wide characters. """ - if isinstance(line, unicode): + if isinstance(line, str): width = 0 for c in unicodedata.normalize('NFC', line): if unicodedata.east_asian_width(c) in ('W', 'F'): @@ -2861,7 +2862,7 @@ def CheckForIncludeWhatYouUse(filename, clean_lines, include_state, error, required = {} # A map of header name to linenumber and the template entity. # Example of required: { '': (1219, 'less<>') } - for linenum in xrange(clean_lines.NumLines()): + for linenum in range(clean_lines.NumLines()): line = clean_lines.elided[linenum] if not line or line[0] == '#': continue @@ -2994,7 +2995,7 @@ def ProcessFileData(filename, file_extension, lines, error): RemoveMultiLineComments(filename, lines, error) clean_lines = CleansedLines(lines) - for line in xrange(clean_lines.NumLines()): + for line in range(clean_lines.NumLines()): ProcessLine(filename, file_extension, clean_lines, line, include_state, function_state, class_state, error) class_state.CheckFinished(filename, error) diff --git a/scripts/rnnlm/get_best_model.py b/scripts/rnnlm/get_best_model.py index e8c6bd8a2f4..333ed8dbfc7 100755 --- a/scripts/rnnlm/get_best_model.py +++ b/scripts/rnnlm/get_best_model.py @@ -3,14 +3,14 @@ # Copyright 2017 Johns Hopkins University (author: Daniel Povey) # License: Apache 2.0. -import os import argparse -import sys +import glob import re +import sys parser = argparse.ArgumentParser(description="Works out the best iteration of RNNLM training " - "based on dev-set perplexity, and prints the number corresponding " - "to that iteration", + "based on dev-set perplexity, and prints the number corresponding " + "to that iteration", epilog="E.g. 
" + sys.argv[0] + " exp/rnnlm_a", formatter_class=argparse.ArgumentDefaultsHelpFormatter) @@ -19,8 +19,7 @@ args = parser.parse_args() - -num_iters=None +num_iters = None try: with open(args.rnnlm_dir + "/info.txt", encoding="latin-1") as f: for line in f: @@ -36,15 +35,15 @@ sys.exit(sys.argv[0] + ": could not get num_iters from {0}/info.txt".format( args.rnnlm_dir)) -best_objf=-2000 -best_iter=-1 +best_objf = -2000 +best_iter = -1 for i in range(1, num_iters): this_logfile = "{0}/log/compute_prob.{1}.log".format(args.rnnlm_dir, i) try: f = open(this_logfile, 'r', encoding='latin-1') except: sys.exit(sys.argv[0] + ": could not open log-file {0}".format(this_logfile)) - this_objf=-1000 + this_objf = -1000 for line in f: m = re.search('Overall objf .* (\S+)$', str(line)) if m is not None: @@ -53,6 +52,10 @@ except Exception as e: sys.exit(sys.argv[0] + ": line in file {0} could not be parsed: {1}, error is: {2}".format( this_logfile, line, str(e))) + # verify this iteration still has model files present + if len(glob.glob("{0}/{1}.raw".format(args.rnnlm_dir, i))) == 0: + # this iteration has log files, but model files have been cleaned up, skip it + continue if this_objf == -1000: print(sys.argv[0] + ": warning: could not parse objective function from {0}".format( this_logfile), file=sys.stderr) @@ -63,5 +66,4 @@ if best_iter == -1: sys.exit(sys.argv[0] + ": error: could not get best iteration.") - print(str(best_iter)) diff --git a/scripts/rnnlm/get_embedding_dim.py b/scripts/rnnlm/get_embedding_dim.py index a5ddb8c25f3..63eaf307498 100755 --- a/scripts/rnnlm/get_embedding_dim.py +++ b/scripts/rnnlm/get_embedding_dim.py @@ -101,4 +101,4 @@ "nnet '{0}': {1} != {2}".format( args.nnet, input_dim, output_dim)) -print(str(input_dim)) +print('{}'.format(input_dim)) diff --git a/scripts/rnnlm/rnnlm_cleanup.py b/scripts/rnnlm/rnnlm_cleanup.py new file mode 100644 index 00000000000..40cbee7a496 --- /dev/null +++ b/scripts/rnnlm/rnnlm_cleanup.py @@ -0,0 +1,160 @@ +#!/usr/bin/env python3 + +# Copyright 2018 Tilde +# License: Apache 2.0 + +import sys + +import argparse +import os +import re +import glob + +script_name = sys.argv[0] + +parser = argparse.ArgumentParser(description="Removes models from past training iterations of " + "RNNLM. Can use either 'keep_latest' (default) or " + "'keep_best' cleanup strategy, where former keeps " + "the models that are freshest, while latter keeps " + "the models with best training objective score on " + "dev set.", + epilog="E.g. 
" + script_name + " exp/rnnlm_a --keep_best", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + +parser.add_argument("rnnlm_dir", + help="Directory where the RNNLM has been trained") +parser.add_argument("--iters_to_keep", + help="Max number of iterations to keep", + type=int, + default=3) +parser.add_argument("--keep_latest", + help="Keeps the training iterations that are latest by age", + action="store_const", + const=True, + default=False) +parser.add_argument("--keep_best", + help="Keeps the training iterations that have the best objf", + action="store_const", + const=True, + default=False) + +args = parser.parse_args() + +# validate arguments +if args.keep_latest and args.keep_best: + sys.exit(script_name + ": can only use one of 'keep_latest' or 'keep_best', but not both") +elif not args.keep_latest and not args.keep_best: + sys.exit(script_name + ": no cleanup strategy specified: use 'keep_latest' or 'keep_best'") + + +class IterationInfo: + def __init__(self, model_files, objf, compute_prob_done): + self.model_files = model_files + self.objf = objf + self.compute_prob_done = compute_prob_done + + def __str__(self): + return "{model_files: %s, compute_prob: %s, objf: %2.3f}" % (self.model_files, + self.compute_prob_done, + self.objf) + + def __repr__(self): + return self.__str__() + + +def get_compute_prob_info(log_file): + # we want to know 3 things: iteration number, objf and whether compute prob is done + iteration = int(log_file.split(".")[-2]) + objf = -2000 + compute_prob_done = False + # roughly based on code in get_best_model.py + try: + f = open(log_file, "r", encoding="latin-1") + except: + print(script_name + ": warning: compute_prob log not found for iteration " + + str(iter) + ". Skipping", + file=sys.stderr) + return iteration, objf, compute_prob_done + for line in f: + objf_m = re.search('Overall objf .* (\S+)$', str(line)) + if objf_m is not None: + try: + objf = float(objf_m.group(1)) + except Exception as e: + sys.exit(script_name + ": line in file {0} could not be parsed: {1}, error is: {2}".format( + log_file, line, str(e))) + if "# Ended" in line: + compute_prob_done = True + if objf == -2000: + print(script_name + ": warning: could not parse objective function from " + log_file, file=sys.stderr) + return iteration, objf, compute_prob_done + + +def get_iteration_files(exp_dir): + iterations = dict() + compute_prob_logs = glob.glob(exp_dir + "/log/compute_prob.[0-9]*.log") + for log in compute_prob_logs: + iteration, objf, compute_prob_done = get_compute_prob_info(log) + if iteration == 0: + # iteration 0 is special, never consider it for cleanup + continue + if compute_prob_done: + # this iteration can be safely considered for cleanup + # gather all model files belonging to it + model_files = [] + # when there are multiple jobs per iteration, there can be several model files + # we need to potentially clean them all up without mixing them up + model_files.extend(glob.glob("{0}/word_embedding.{1}.mat".format(exp_dir, iteration))) + model_files.extend(glob.glob("{0}/word_embedding.{1}.[0-9]*.mat".format(exp_dir, iteration))) + model_files.extend(glob.glob("{0}/feat_embedding.{1}.mat".format(exp_dir, iteration))) + model_files.extend(glob.glob("{0}/feat_embedding.{1}.[0-9]*.mat".format(exp_dir, iteration))) + model_files.extend(glob.glob("{0}/{1}.raw".format(exp_dir, iteration))) + model_files.extend(glob.glob("{0}/{1}.[0-9]*.raw".format(exp_dir, iteration))) + # compute_prob logs outlive model files, only consider iterations that do still have model 
files + if len(model_files) > 0: + iterations[iteration] = IterationInfo(model_files, objf, compute_prob_done) + return iterations + + +def remove_model_files_for_iter(iter_info): + for f in iter_info.model_files: + os.remove(f) + + +def keep_latest(iteration_dict): + max_to_keep = args.iters_to_keep + kept = 0 + iterations_in_reverse_order = reversed(sorted(iteration_dict)) + for iter in iterations_in_reverse_order: + if kept < max_to_keep: + kept += 1 + else: + remove_model_files_for_iter(iteration_dict[iter]) + + +def keep_best(iteration_dict): + iters_to_keep = args.iters_to_keep + best = [] + for iter, iter_info in iteration_dict.items(): + objf = iter_info.objf + if objf == -2000: + print(script_name + ": warning: objf unavailable for iter " + str(iter), file=sys.stderr) + continue + # add potential best, sort by objf, trim to iters_to_keep size + best.append((iter, objf)) + best = sorted(best, key=lambda x: -x[1]) + if len(best) > iters_to_keep: + throwaway = best[iters_to_keep:] + best = best[:iters_to_keep] + # remove iters that we know are not the best + for (iter, _) in throwaway: + remove_model_files_for_iter(iteration_dict[iter]) + + +# grab all the iterations mapped to their model files, objf score and compute_prob status +iterations = get_iteration_files(args.rnnlm_dir) +# apply chosen cleanup strategy +if args.keep_latest: + keep_latest(iterations) +else: + keep_best(iterations) diff --git a/scripts/rnnlm/train_rnnlm.sh b/scripts/rnnlm/train_rnnlm.sh index aedfc470ac9..d6d38f3d734 100755 --- a/scripts/rnnlm/train_rnnlm.sh +++ b/scripts/rnnlm/train_rnnlm.sh @@ -38,6 +38,11 @@ num_egs_threads=10 # number of threads used for sampling, if we're using use_gpu=true # use GPU for training use_gpu_for_diagnostics=false # set true to use GPU for compute_prob_*.log +# optional cleanup options +cleanup=false # add option --cleanup true to enable automatic cleanup of old models +cleanup_strategy="keep_latest" # determines cleanup strategy, use either "keep_latest" or "keep_best" +cleanup_keep_iters=3 # number of iterations that will have their models retained + trap 'for pid in $(jobs -pr); do kill -KILL $pid; done' INT QUIT TERM . utils/parse_options.sh @@ -222,12 +227,16 @@ while [ $x -lt $num_iters ]; do nnet3-average $src_models $dir/$[x+1].raw '&&' \ matrix-sum --average=true $src_matrices $dir/${embedding_type}_embedding.$[x+1].mat fi + # optionally, perform cleanup after training + if [ "$cleanup" = true ] ; then + python3 rnnlm/rnnlm_cleanup.py $dir --$cleanup_strategy --iters_to_keep $cleanup_keep_iters + fi ) - # the error message below is not that informative, but $cmd will # have printed a more specific one. [ -f $dir/.error ] && echo "$0: error with diagnostics on iteration $x of training" && exit 1; fi + x=$[x+1] num_splits_processed=$[num_splits_processed+this_num_jobs] done diff --git a/src/Makefile b/src/Makefile index 6dfd146e3d5..1b37ebce745 100644 --- a/src/Makefile +++ b/src/Makefile @@ -36,6 +36,7 @@ all: $(MAKE) kaldi.mk $(MAKE) mklibdir $(MAKE) subdirs + $(MAKE) -C matrix test -echo Done subdirs: $(SUBDIRS) diff --git a/src/base/io-funcs.h b/src/base/io-funcs.h index ca476033950..6c2b690f54c 100644 --- a/src/base/io-funcs.h +++ b/src/base/io-funcs.h @@ -31,7 +31,9 @@ #include #include #include + #include "base/kaldi-common.h" +#include "base/io-funcs-inl.h" namespace kaldi { @@ -235,7 +237,4 @@ inline void InitKaldiOutputStream(std::ostream &os, bool binary); inline bool InitKaldiInputStream(std::istream &is, bool *binary); } // end namespace kaldi. 
- -#include "base/io-funcs-inl.h" - #endif // KALDI_BASE_IO_FUNCS_H_ diff --git a/src/bin/draw-tree.cc b/src/bin/draw-tree.cc index ad1dd41a53f..c9be5586933 100644 --- a/src/bin/draw-tree.cc +++ b/src/bin/draw-tree.cc @@ -18,6 +18,7 @@ // limitations under the License. #include "tree/tree-renderer.h" +#include "tree/context-dep.h" void MakeEvent(std::string &qry, fst::SymbolTable *phone_syms, kaldi::EventType **query) diff --git a/src/chainbin/chain-get-supervision.cc b/src/chainbin/chain-get-supervision.cc index 6090d9f0058..1ac89d4630b 100644 --- a/src/chainbin/chain-get-supervision.cc +++ b/src/chainbin/chain-get-supervision.cc @@ -22,6 +22,7 @@ #include "base/kaldi-common.h" #include "util/common-utils.h" #include "chain/chain-supervision.h" +#include "tree/context-dep.h" namespace kaldi { namespace chain { diff --git a/src/chainbin/nnet3-chain-combine.cc b/src/chainbin/nnet3-chain-combine.cc index a3222d2285f..b534316bf7f 100644 --- a/src/chainbin/nnet3-chain-combine.cc +++ b/src/chainbin/nnet3-chain-combine.cc @@ -72,7 +72,7 @@ double ComputeObjf(bool batchnorm_test_mode, bool dropout_test_mode, void UpdateNnetMovingAverage(int32 num_models, const Nnet &nnet, Nnet *moving_average_nnet) { KALDI_ASSERT(NumParameters(nnet) == NumParameters(*moving_average_nnet)); - ScaleNnet((num_models - 1.0) / num_models, moving_average_nnet); + ScaleNnetForAverage((num_models - 1.0) / num_models, moving_average_nnet); AddNnet(nnet, 1.0 / num_models, moving_average_nnet); } @@ -117,7 +117,7 @@ int main(int argc, char *argv[]) { po.Register("use-gpu", &use_gpu, "yes|no|optional|wait, only has effect if compiled with CUDA"); po.Register("batchnorm-test-mode", &batchnorm_test_mode, - "If true, set test-mode to true on any BatchNormComponents " + "If true, set test-mode to true on any BatchNormComponents and BatchRenormComponents" "while evaluating objectives."); po.Register("dropout-test-mode", &dropout_test_mode, "If true, set test-mode to true on any DropoutComponents and " diff --git a/src/configure b/src/configure index c4a1445efbd..b94731da918 100755 --- a/src/configure +++ b/src/configure @@ -558,66 +558,23 @@ function linux_check_static { fi } -function linux_configure_debian_ubuntu { - m=$1 - ATLASLIBS="/usr/lib$m/atlas-base/libatlas.so.3gf /usr/lib$m/atlas-base/libf77blas.so.3gf /usr/lib$m/atlas-base/libcblas.so.3gf /usr/lib$m/atlas-base/liblapack_atlas.so.3gf" - for f in $ATLASLIBS; do - [ ! -f $f ] && return 1; - done - lapacklib=$(echo $ATLASLIBS | awk '{print $NF}') - if ! nm --dynamic $lapacklib | grep ATL_cgetrf >/dev/null; then - exit 1; - fi - echo ATLASINC = $ATLASROOT/include >> kaldi.mk - echo ATLASLIBS = $ATLASLIBS >> kaldi.mk - echo >> kaldi.mk - if [[ "$TARGET_ARCH" == arm* ]]; then - cat makefiles/linux_atlas_arm.mk >> kaldi.mk - elif [[ "$TARGET_ARCH" == ppc64le ]]; then - cat makefiles/linux_atlas_ppc64le.mk >> kaldi.mk - else - cat makefiles/linux_atlas.mk >> kaldi.mk - fi - echo "Successfully configured for Debian/Ubuntu Linux [dynamic libraries] with ATLASLIBS =$ATLASLIBS" - $use_cuda && configure_cuda - linux_configure_speex -} - -function linux_configure_debian_ubuntu3 { - ATLASLIBS="/usr/lib/libatlas.so.3 /usr/lib/libf77blas.so.3 /usr/lib/libcblas.so.3 /usr/lib/liblapack_atlas.so.3" - for f in $ATLASLIBS; do - [ ! -f $f ] && return 1; - done - lapacklib=$(echo $ATLASLIBS | awk '{print $NF}') - if ! 
nm --dynamic $lapacklib | grep ATL_cgetrf >/dev/null; then - exit 1; - fi - echo ATLASINC = $ATLASROOT/include >> kaldi.mk - echo ATLASLIBS = $ATLASLIBS >> kaldi.mk - echo >> kaldi.mk - if [[ "$TARGET_ARCH" == arm* ]]; then - cat makefiles/linux_atlas_arm.mk >> kaldi.mk - elif [[ "$TARGET_ARCH" == ppc64le ]]; then - cat makefiles/linux_atlas_ppc64le.mk >> kaldi.mk - else - cat makefiles/linux_atlas.mk >> kaldi.mk - fi - echo "Successfully configured for Debian/Ubuntu Linux [dynamic libraries] with ATLASLIBS =$ATLASLIBS" - $use_cuda && configure_cuda - linux_configure_speex -} - -function linux_configure_debian7 { - ATLASLIBS="/usr/lib/atlas-base/libatlas.so.3.0 /usr/lib/atlas-base/libf77blas.so.3.0 /usr/lib/atlas-base/libcblas.so.3 /usr/lib/atlas-base/liblapack_atlas.so.3" +function linux_configure_atlas_generic { + # You pass in a directory (e.g. /usr/lib/atlas-base) and a suffix (e.g. so.3.0) + # and it tries to find ATLAS libraries with that dir and suffix. On success it + # returns 0; on failure, it returns 1. + dir=$1 + suffix=$2 + ATLASLIBS="$dir/libatlas.$suffix $dir/libf77blas.$suffix $dir/libcblas.$suffix $dir/liblapack_atlas.$suffix" for f in $ATLASLIBS; do [ ! -f $f ] && return 1; done lapacklib=$(echo $ATLASLIBS | awk '{print $NF}') if ! nm --dynamic $lapacklib | grep ATL_cgetrf >/dev/null; then + echo "configure: failed to find symbol ATL_cgetrf in library $lapacklib" exit 1; fi libdir=$(dirname $(echo $ATLASLIBS | awk '{print $1}')) - [ -z "$libdir" ] && echo "Error getting libdir in linux_configure_debian7" && exit 1; + [ -z "$libdir" ] && echo "Error getting libdir in linux_configure_atlas_generic: dir=$dir,suffix=$suffix" && exit 1; echo ATLASINC = $ATLASROOT/include >> kaldi.mk echo ATLASLIBS = $ATLASLIBS -Wl,-rpath=$libdir >> kaldi.mk echo >> kaldi.mk @@ -628,33 +585,11 @@ function linux_configure_debian7 { else cat makefiles/linux_atlas.mk >> kaldi.mk fi - echo "Successfully configured for Debian 7 [dynamic libraries] with ATLASLIBS =$ATLASLIBS" + echo "Successfully configured ATLAS with ATLASLIBS=$ATLASLIBS" $use_cuda && configure_cuda linux_configure_speex } -function linux_configure_redhat { - m=$1 # 64 or empty. - ATLASLIBS="/usr/lib$m/atlas/libatlas.so.3 /usr/lib$m/atlas/libf77blas.so.3 /usr/lib$m/atlas/libcblas.so.3 /usr/lib$m/atlas/libclapack.so.3" - for f in $ATLASLIBS; do - [ ! -f $f ] && return 1; - done - libdir=$(dirname $(echo $ATLASLIBS | awk '{print $1}')) - [ -z "$libdir" ] && echo "Error getting libdir in linux_configure_redhat" && exit 1; - echo ATLASINC = $ATLASROOT/include >> kaldi.mk - echo ATLASLIBS = $ATLASLIBS -Wl,-rpath=$libdir >> kaldi.mk - echo >> kaldi.mk - if [[ "$TARGET_ARCH" == arm* ]]; then - cat makefiles/linux_atlas_arm.mk >> kaldi.mk - elif [[ "$TARGET_ARCH" == ppc64le ]]; then - cat makefiles/linux_atlas_ppc64le.mk >> kaldi.mk - else - cat makefiles/linux_atlas.mk >> kaldi.mk - fi - echo "Successfully configured for red hat [dynamic libraries] with ATLASLIBS =$ATLASLIBS" - $use_cuda && configure_cuda -} - function linux_configure_redhat_fat { # This is for when only two so-called 'fat' ATLAS libs are provided: # libsatlas.so.3 and libtatlas.so.3. @@ -680,7 +615,7 @@ function linux_configure_redhat_fat { $use_cuda && configure_cuda } -function linux_configure_static { +function linux_configure_atlas_static { if $threaded_atlas; then pt=pt; else pt=""; fi if [ -z $ATLASLIBDIR ]; then # Note: it'll pick up the last one below. 
@@ -699,11 +634,11 @@ function linux_configure_static { echo "Validating presence of ATLAS libs in $ATLASLIBDIR" ATLASLIBS= # The Lapack part of ATLAS seems to appear under various different names.. but it - # should always have symbols like ATL_cgetrf defined, so we test for this, - # for all the names we have encountered. + # should always have symbols like ATL_cgetrf and clapack_cgetrf defined, so we test for this. for libname in liblapack liblapack_atlas libclapack; do if [ -f $ATLASLIBDIR/${libname}.a -a "$ATLASLIBS" == "" ]; then - if nm $ATLASLIBDIR/${libname}.a | grep ATL_cgetrf >/dev/null; then + if nm $ATLASLIBDIR/${libname}.a | grep ATL_cgetrf >/dev/null && \ + nm $ATLASLIBDIR/${libname}.a | grep clapack_cgetrf >/dev/null; then ATLASLIBS=$ATLASLIBDIR/${libname}.a echo "Using library $ATLASLIBS as ATLAS's CLAPACK library." fi @@ -782,11 +717,11 @@ function linux_configure_dynamic { echo "Validating presence of ATLAS libs in $ATLASLIBDIR" ATLASLIBS= # The Lapack part of ATLAS seems to appear under various different names.. but it - # should always have symbols like ATL_cgetrf defined, so we test for this, - # for all the names we have encountered. + # should always have symbols like clapack_cgetrf and ATL_cgetrf defined, so we test for this. for libname in lapack lapack_atlas clapack; do if [ -f $ATLASLIBDIR/lib${libname}.so -a "$ATLASLIBS" == "" ]; then - if nm --dynamic $ATLASLIBDIR/lib${libname}.so | grep ATL_cgetrf >/dev/null; then + if nm --dynamic $ATLASLIBDIR/lib${libname}.so | grep clapack_cgetrf >/dev/null && \ + nm --dynamic $ATLASLIBDIR/lib${libname}.so | grep ATL_cgetrf >/dev/null; then ATLASLIBS="$ATLASLIBDIR/lib${libname}.so" echo "Using library $ATLASLIBS as ATLAS's CLAPACK library." fi @@ -1229,33 +1164,18 @@ elif [ "`uname`" == "Linux" ]; then # containing {liblapack.a,libblas.a}, and linking against just these two # libraries worked. - if $static_math; then - # Prefer static to dynamic math. - linux_configure_static || \ - linux_configure_debian_ubuntu3 || \ - linux_configure_dynamic || \ - linux_configure_debian_ubuntu 64 || \ - linux_configure_debian_ubuntu || \ - linux_configure_debian7 || \ - linux_configure_redhat 64 || \ - linux_configure_redhat || \ - linux_configure_redhat_fat 64 || \ - linux_configure_redhat_fat || \ - linux_atlas_failure "Failed to configure ATLAS libraries"; - else - # Prefer dynamic to static math. 
- linux_configure_debian_ubuntu3 || \ - linux_configure_dynamic || \ - linux_configure_static || \ - linux_configure_debian_ubuntu 64 || \ - linux_configure_debian_ubuntu || \ - linux_configure_debian7 || \ - linux_configure_redhat 64 || \ - linux_configure_redhat || \ - linux_configure_redhat_fat 64 || \ - linux_configure_redhat_fat || \ - linux_atlas_failure "Failed to configure ATLAS libraries"; - fi + ( $static_math && linux_configure_atlas_static ) || \ + linux_configure_atlas_generic /usr/lib "so.3" || \ + linux_configure_atlas_generic /usr/lib/atlas-base "so.3gf" || \ + linux_configure_atlas_generic /usr/lib64/atlas-base "so.3gf" \ + linux_configure_atlas_generic /usr/lib/atlas "so.3" || \ + linux_configure_atlas_generic /usr/lib64/atlas "so.3" || \ + linux_configure_atlas_generic /usr/lib/x86_64-linux-gnu/ "so.3" || \ + linux_configure_atlas_generic /usr/lib/x86_64-linux-gnu/ "so" || \ + linux_configure_redhat_fat 64 || \ + linux_configure_redhat_fat || \ + linux_configure_atlas_static || \ + linux_atlas_failure "Failed to configure ATLAS libraries"; elif [ "$MATHLIB" == "MKL" ]; then if [ "$TARGET_ARCH" != "x86_64" ]; then diff --git a/src/decoder/decodable-matrix.h b/src/decoder/decodable-matrix.h index f32a007e6ca..475638a35af 100644 --- a/src/decoder/decodable-matrix.h +++ b/src/decoder/decodable-matrix.h @@ -26,6 +26,7 @@ #include "base/kaldi-common.h" #include "hmm/transition-model.h" #include "itf/decodable-itf.h" +#include "matrix/kaldi-matrix.h" namespace kaldi { @@ -241,8 +242,6 @@ class DecodableMatrixScaled: public DecodableInterface { BaseFloat scale_; KALDI_DISALLOW_COPY_AND_ASSIGN(DecodableMatrixScaled); }; - - } // namespace kaldi #endif // KALDI_DECODER_DECODABLE_MATRIX_H_ diff --git a/src/decoder/lattice-faster-online-decoder.h b/src/decoder/lattice-faster-online-decoder.h index e56f24a2474..69bf8b6d98d 100644 --- a/src/decoder/lattice-faster-online-decoder.h +++ b/src/decoder/lattice-faster-online-decoder.h @@ -90,7 +90,7 @@ class LatticeFasterOnlineDecoderTpl: /// Outputs an FST corresponding to the single best path through the lattice. /// This is quite efficient because it doesn't get the entire raw lattice and find - /// the best path through it; insterad, it uses the BestPathEnd and BestPathIterator + /// the best path through it; instead, it uses the BestPathEnd and BestPathIterator /// so it basically traces it back through the lattice. /// Returns true if result is nonempty (using the return status is deprecated, /// it will become void). If "use_final_probs" is true AND we reached the diff --git a/src/decoder/training-graph-compiler.h b/src/decoder/training-graph-compiler.h index 77c5735687f..ee56c6dfb3d 100644 --- a/src/decoder/training-graph-compiler.h +++ b/src/decoder/training-graph-compiler.h @@ -24,6 +24,7 @@ #include "hmm/transition-model.h" #include "fst/fstlib.h" #include "fstext/fstext-lib.h" +#include "tree/context-dep.h" namespace kaldi { diff --git a/src/doc/grammar.dox b/src/doc/grammar.dox index 80000c0b067..d1c6f51f349 100644 --- a/src/doc/grammar.dox +++ b/src/doc/grammar.dox @@ -336,7 +336,7 @@ Z_S 243 that consume CLG.fst always also consume the ilabel_info, which is a vector >. For a particular ilabel, say 1536, ilabel_info[1536] = { 5, 21 } is a vector of integers representing a phone-in-context. E.g. this would represent the phone 21 with a left-context of 5. 
- Disambiguation symbols also appear on the input of CLG.fst, and they are are represented in the ilabel_info + Disambiguation symbols also appear on the input of CLG.fst, and they are represented in the ilabel_info a 1-dimensional vector like { -104 } containing the negative of the disambiguation symbol's integer id. @@ -352,7 +352,7 @@ Z_S 243 The special symbols in CLG.fst will be as follows. The following special symbols may appear in any CLG graph, top-level or not: - - When any graph invokes a sub-graph, there will ben arc with an ilabel + - When any graph invokes a sub-graph, there will be n arc with an ilabel (\#nonterm:foo, left-context-phone) representing the user-specified nonterminal and the actual left-context, which will be followed by arcs with ilabels of the form (\#nonterm_reenter, diff --git a/src/gmmbin/gmm-init-biphone.cc b/src/gmmbin/gmm-init-biphone.cc index e5cc182f94c..42a9d1a91a0 100644 --- a/src/gmmbin/gmm-init-biphone.cc +++ b/src/gmmbin/gmm-init-biphone.cc @@ -22,6 +22,7 @@ #include "util/common-utils.h" #include "gmm/am-diag-gmm.h" #include "tree/event-map.h" +#include "tree/context-dep.h" #include "hmm/hmm-topology.h" #include "hmm/transition-model.h" diff --git a/src/gmmbin/gmm-init-mono.cc b/src/gmmbin/gmm-init-mono.cc index 0aac769eb70..3c370c36515 100644 --- a/src/gmmbin/gmm-init-mono.cc +++ b/src/gmmbin/gmm-init-mono.cc @@ -23,6 +23,7 @@ #include "gmm/am-diag-gmm.h" #include "hmm/hmm-topology.h" #include "hmm/transition-model.h" +#include "tree/context-dep.h" namespace kaldi { // This function reads a file like: diff --git a/src/hmm/hmm-test-utils.h b/src/hmm/hmm-test-utils.h index 495ebf278ae..4faaa92fa66 100644 --- a/src/hmm/hmm-test-utils.h +++ b/src/hmm/hmm-test-utils.h @@ -24,6 +24,7 @@ #include "hmm/hmm-topology.h" #include "hmm/transition-model.h" #include "lat/kaldi-lattice.h" +#include "tree/context-dep.h" namespace kaldi { diff --git a/src/hmm/hmm-topology.h b/src/hmm/hmm-topology.h index edea02998c0..750d35bcfe4 100644 --- a/src/hmm/hmm-topology.h +++ b/src/hmm/hmm-topology.h @@ -21,7 +21,6 @@ #define KALDI_HMM_HMM_TOPOLOGY_H_ #include "base/kaldi-common.h" -#include "tree/context-dep.h" #include "util/const-integer-set.h" diff --git a/src/hmm/posterior.h b/src/hmm/posterior.h index 0c255845dd5..e153c249740 100644 --- a/src/hmm/posterior.h +++ b/src/hmm/posterior.h @@ -24,7 +24,6 @@ #define KALDI_HMM_POSTERIOR_H_ #include "base/kaldi-common.h" -#include "tree/context-dep.h" #include "util/const-integer-set.h" #include "util/kaldi-table.h" #include "hmm/transition-model.h" diff --git a/src/hmm/transition-model.h b/src/hmm/transition-model.h index f03b54e8b71..e453c24f9cb 100644 --- a/src/hmm/transition-model.h +++ b/src/hmm/transition-model.h @@ -22,11 +22,12 @@ #define KALDI_HMM_TRANSITION_MODEL_H_ #include "base/kaldi-common.h" -#include "tree/context-dep.h" #include "util/const-integer-set.h" #include "fst/fst-decl.h" // forward declarations. 
#include "hmm/hmm-topology.h" #include "itf/options-itf.h" +#include "itf/context-dep-itf.h" +#include "matrix/kaldi-vector.h" namespace kaldi { diff --git a/src/lat/minimize-lattice.h b/src/lat/minimize-lattice.h index fcf6c0f36df..eb13fc1c851 100644 --- a/src/lat/minimize-lattice.h +++ b/src/lat/minimize-lattice.h @@ -28,7 +28,6 @@ #include "base/kaldi-common.h" #include "util/common-utils.h" #include "fstext/fstext-lib.h" -#include "hmm/transition-model.h" #include "lat/kaldi-lattice.h" namespace fst { diff --git a/src/lat/push-lattice.h b/src/lat/push-lattice.h index e782aadc0f3..080bb637604 100644 --- a/src/lat/push-lattice.h +++ b/src/lat/push-lattice.h @@ -28,7 +28,6 @@ #include "base/kaldi-common.h" #include "util/common-utils.h" #include "fstext/fstext-lib.h" -#include "hmm/transition-model.h" #include "lat/kaldi-lattice.h" namespace fst { diff --git a/src/makefiles/default_rules.mk b/src/makefiles/default_rules.mk index ee0f3c2e90b..7af6497abec 100644 --- a/src/makefiles/default_rules.mk +++ b/src/makefiles/default_rules.mk @@ -27,10 +27,16 @@ endif all: $(LIBFILE) $(BINFILES) -$(LIBFILE): $(OBJFILES) + +ifdef LIBNAME + +$(LIBNAME).a: $(OBJFILES) $(AR) -cr $(LIBNAME).a $(OBJFILES) $(RANLIB) $(LIBNAME).a + ifeq ($(KALDI_FLAVOR), dynamic) +# the LIBFILE is not the same as $(LIBNAME).a +$(LIBFILE): $(LIBNAME).a ifeq ($(shell uname), Darwin) $(CXX) -dynamiclib -o $@ -install_name @rpath/$@ $(LDFLAGS) $(OBJFILES) $(LDLIBS) ln -sf $(shell pwd)/$@ $(KALDILIBDIR)/$@ @@ -41,7 +47,8 @@ ifeq ($(KALDI_FLAVOR), dynamic) else # Platform not supported $(error Dynamic libraries not supported on this platform. Run configure with --static flag.) endif -endif +endif # ifeq ($(KALDI_FLAVOR), dynamic) +endif # ifdef LIBNAME # By default (GNU) make uses the C compiler $(CC) for linking object files even # if they were compiled from a C++ source. Below redefinition forces make to diff --git a/src/matrix/kaldi-blas.h b/src/matrix/kaldi-blas.h index 5d25ab852bd..8a06540bba2 100644 --- a/src/matrix/kaldi-blas.h +++ b/src/matrix/kaldi-blas.h @@ -50,8 +50,8 @@ #ifdef HAVE_ATLAS extern "C" { - #include - #include + #include "cblas.h" + #include "clapack.h" } #elif defined(HAVE_CLAPACK) #ifdef __APPLE__ @@ -74,7 +74,7 @@ // from the tools/CLAPACK_include directory. #include #include - #include + #include // get rid of macros from f2c.h -- these are dangerous. 
#undef abs @@ -110,7 +110,7 @@ #undef bit_clear #undef bit_set #else - #error "You need to define (using the preprocessor) either HAVE_CLAPACK or HAVE_ATLAS or HAVE_MKL (but not more than one)" + #error "You need to define (using the preprocessor) either HAVE_CLAPACK or HAVE_ATLAS or HAVE_MKL (but not more than one)" #endif #ifdef HAVE_OPENBLAS diff --git a/src/nnet/nnet-trnopts.h b/src/nnet/nnet-trnopts.h index 12ad1b1cbb5..0a064e17fd4 100644 --- a/src/nnet/nnet-trnopts.h +++ b/src/nnet/nnet-trnopts.h @@ -52,7 +52,7 @@ struct NnetTrainOptions { // print for debug purposes friend std::ostream& operator<<(std::ostream& os, const NnetTrainOptions& opts) { - os << "RbmTrainOptions : " + os << "NnetTrainOptions : " << "learn_rate" << opts.learn_rate << ", " << "momentum" << opts.momentum << ", " << "l2_penalty" << opts.l2_penalty << ", " diff --git a/src/nnet3/nnet-batch-compute.cc b/src/nnet3/nnet-batch-compute.cc index 6db046796be..5da55d0f70d 100644 --- a/src/nnet3/nnet-batch-compute.cc +++ b/src/nnet3/nnet-batch-compute.cc @@ -135,7 +135,7 @@ NnetBatchComputer::GetHighestPriorityComputation( int32 *minibatch_size_out, std::vector *tasks) { tasks->clear(); - std::unique_lock(mutex_); + std::unique_lock lock(mutex_); MapType::iterator iter = tasks_.begin(), end = tasks_.end(), best_iter = tasks_.end(); double highest_priority = -std::numeric_limits::infinity(); @@ -1094,7 +1094,7 @@ bool NnetBatchDecoder::GetOutput( return false; UtteranceOutput *this_output = pending_utts_.front(); pending_utts_.pop_front(); - if (this_output->compact_lat.NumStates() == 0) { + if (this_output->lat.NumStates() == 0) { delete this_output; // ... and continue round the loop, without returning any output to the // user for this utterance. Something went wrong in decoding: for diff --git a/src/nnet3/nnet-component-itf.cc b/src/nnet3/nnet-component-itf.cc index 1ff7daa01d1..c66dc347ce9 100644 --- a/src/nnet3/nnet-component-itf.cc +++ b/src/nnet3/nnet-component-itf.cc @@ -173,6 +173,8 @@ Component* Component::NewComponentOfType(const std::string &component_type) { ans = new LstmNonlinearityComponent(); } else if (component_type == "BatchNormComponent") { ans = new BatchNormComponent(); + } else if (component_type == "BatchRenormComponent") { + ans = new BatchRenormComponent(); } else if (component_type == "TimeHeightConvolutionComponent") { ans = new TimeHeightConvolutionComponent(); } else if (component_type == "RestrictedAttentionComponent") { diff --git a/src/nnet3/nnet-compute.h b/src/nnet3/nnet-compute.h index 333ed3168b9..f96195ff146 100644 --- a/src/nnet3/nnet-compute.h +++ b/src/nnet3/nnet-compute.h @@ -119,7 +119,7 @@ class NnetComputer { // Version of GetOutput that calls Swap(), destroying the output stored inside // this object. You should probably not use this if you plan to call - // Backward() on the same NnetComputer object, or it's a recurret + // Backward() on the same NnetComputer object, or it's a recurrent // computation-- it may lead to a crash. 
void GetOutputDestructive(const std::string &output_name, CuMatrix *output); diff --git a/src/nnet3/nnet-normalize-component.cc b/src/nnet3/nnet-normalize-component.cc index d10c6fabd36..0384faf2293 100644 --- a/src/nnet3/nnet-normalize-component.cc +++ b/src/nnet3/nnet-normalize-component.cc @@ -3,6 +3,7 @@ // Copyright 2015-2017 Johns Hopkins University (author: Daniel Povey) // 2015 Guoguo Chen // 2015 Daniel Galvez +// 2018 Gaofeng Cheng (Institute of Acoustics, Chinese Academy of Sciences) // See ../../COPYING for clarification regarding multiple authors // @@ -341,7 +342,7 @@ void BatchNormComponent::InitFromConfig(ConfigLine *cfl) { y(i) = x(i) - mean var = 1/I \sum_i y(i)^2 - rscale = sqrt(var + epsilon)^power <---- For regular batchnorm, power == -0.5. + rscale = (var + epsilon)^power <---- For regular batchnorm, power == -0.5. z(i) = target-rms * rscale * y(i) @@ -378,7 +379,7 @@ void BatchNormComponent::InitFromConfig(ConfigLine *cfl) { mean' = \sum_i y'(i) = (target-rms * rscale * \sum_i z'(i)) + (var_deriv_mod \sum_i z(i)) [... and the 2nd term above is zero when summed over i, because \sum_i z(i) is zero, ...] - = target-rms * rscale * \sum_i z(i) + = target-rms * rscale * \sum_i z'(i) and: x'(i) = z'(i) * target-rms * rscale + z(i) var_deriv_mod - 1/I mean' = z'(i) * target-rms * rscale + z(i) var_deriv_mod - 1/I * target-rms * rscale * \sum_i z'(i) @@ -675,6 +676,645 @@ void BatchNormComponent::ZeroStats() { } } +void BatchRenormComponent::ComputeDerived() { + if (!test_mode_) { + offset_.Resize(0); + scale_.Resize(0); + return; + } + bool compute_prob_zero_iter = false; + + if (count_ == 0.0) { + KALDI_WARN << "Test-mode is set but there is no data count. " + "Creating random counts. This only makes sense " + "in unit-tests (or compute_prob_*.0.log). If you see this " + "elsewhere, something is very wrong."; + count_ = 1.0; + stats_sum_.SetRandn(); + stats_sumsq_.SetRandn(); + stats_sumsq_.AddVecVec(1.0, stats_sum_, stats_sum_, 1.0); + compute_prob_zero_iter = true; + } + offset_.Resize(block_dim_); + scale_.Resize(block_dim_); + if (compute_prob_zero_iter) { + offset_.CopyFromVec(stats_sum_); + offset_.Scale(-1.0 / count_); + // now offset_ is -mean. + scale_.CopyFromVec(stats_sumsq_); + scale_.Scale(1.0 / count_); + scale_.AddVecVec(-1.0, offset_, offset_, 1.0); + // now scale_ is variance. + // Mathematically the ApplyFloor statement should be a no-op; this is in case + // of numerical roundoff. + scale_.ApplyFloor(0.0); + scale_.Add(epsilon_); + BaseFloat power = -0.5; + scale_.ApplyPow(power); + // now scale_ = min(variance, epsilon)^power + // next, multiply by the target RMS (normally 1.0). + scale_.Scale(target_rms_); + offset_.MulElements(scale_); + // now offset_ is -(scale*mean). 
+ } else { + offset_.CopyFromVec(moving_mean_); + scale_.CopyFromVec(moving_stddev_); + scale_.ApplyPow(-1.0); + offset_.MulElements(scale_); + offset_.Scale(-1.0); + } +} + +void BatchRenormComponent::SetTestMode(bool test_mode) { + test_mode_ = test_mode; + ComputeDerived(); +} + +// for batch-renorm, target-rms should be 1.0 +void BatchRenormComponent::Check() const { + KALDI_ASSERT(dim_ > 0 && block_dim_ > 0 && dim_ % block_dim_ == 0 && + epsilon_ > 0.0 && target_rms_ == 1.0 && r_max_ > 0 && d_max_ >= 0 && alpha_ >= 0.0); +} + +BatchRenormComponent::BatchRenormComponent(const BatchRenormComponent &other): + dim_(other.dim_), block_dim_(other.block_dim_), + epsilon_(other.epsilon_), target_rms_(other.target_rms_), + test_mode_(other.test_mode_), count_(other.count_), + stats_sum_(other.stats_sum_), stats_sumsq_(other.stats_sumsq_), + training_begining_(other.training_begining_), + r_max_(other.r_max_), d_max_(other.d_max_), average_count_(other.average_count_), + alpha_(other.alpha_), moving_mean_(other.moving_mean_), + moving_stddev_(other.moving_stddev_) { + ComputeDerived(); + Check(); +} + + +std::string BatchRenormComponent::Info() const { + std::ostringstream stream; + stream << Type() << ", dim=" << dim_ << ", block-dim=" << block_dim_ + << ", epsilon=" << epsilon_ << ", target-rms=" << target_rms_ + << ", count=" << count_ + << ", test-mode=" << (test_mode_ ? "true" : "false"); + if (count_ > 0) { + Vector mean(stats_sum_), var(stats_sumsq_); + mean.Scale(1.0 / count_); + var.Scale(1.0 / count_); + // subtract mean^2 from var. + var.AddVecVec(-1.0, mean, mean, 1.0); + var.ApplyFloor(0.0); + var.ApplyPow(0.5); // make it the stddev. + stream << ", data-mean=" << SummarizeVector(mean) + << ", data-stddev=" << SummarizeVector(var); + Vector moving_mean_copy(moving_mean_), moving_stddev_copy(moving_stddev_); + stream << ", moving-mean=" << SummarizeVector(moving_mean_copy) + << ", moving-stddv=" << SummarizeVector(moving_stddev_copy); + } + return stream.str(); +} + +void BatchRenormComponent::InitFromConfig(ConfigLine *cfl) { + dim_ = -1; + block_dim_ = -1; + epsilon_ = 1.0e-03; + target_rms_ = 1.0; + test_mode_ = false; + training_begining_ = true; + r_max_ = 1.0; + d_max_ = 0.0; + alpha_ = 0.01; + + bool ok = cfl->GetValue("dim", &dim_); + cfl->GetValue("block-dim", &block_dim_); + cfl->GetValue("epsilon", &epsilon_); + cfl->GetValue("target-rms", &target_rms_); + cfl->GetValue("test-mode", &test_mode_); + cfl->GetValue("r-max", &r_max_); + cfl->GetValue("d-max", &d_max_); + cfl->GetValue("alpha", &alpha_); + if (!ok || dim_ <= 0) { + KALDI_ERR << "BatchRenormComponent must have 'dim' specified, and > 0"; + } + if (block_dim_ == -1) + block_dim_ = dim_; + if (!(block_dim_ > 0 && dim_ % block_dim_ == 0 && + epsilon_ > 0 && target_rms_ > 0)) + KALDI_ERR << "Invalid configuration in BatchRenormComponent."; + if (cfl->HasUnusedValues()) + KALDI_ERR << "Could not process these elements in initializer: " + << cfl->UnusedValues(); + count_ = 0; + average_count_ = 1.0; + stats_sum_.Resize(block_dim_); + stats_sumsq_.Resize(block_dim_); + moving_stddev_.Resize(block_dim_); + moving_mean_.Resize(block_dim_); + if (test_mode_) { + ComputeDerived(); + } +} + + + +/* + BATCH-RENORM_MATH + + This comment describes the equations involved in batch-renorm normalization, and + derives the forward and back-propagation. + + For BatchRenorm we just set target-rms = 1.0. + + This is all dimension-by-dimension, so we just imagine the inputs + are scalars x(i), for i=0 .. n-1. 
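+  For a concrete reference alongside the derivation below, here is a small
+  NumPy sketch of the forward pass in training mode (illustration only, it is
+  not used anywhere in the code; the function name is made up):
+
+    import numpy as np
+
+    def batch_renorm_forward(x, moving_mean, moving_stddev,
+                             r_max=1.0, d_max=0.0, epsilon=1.0e-03):
+        # x has shape [num_frames, block_dim]; statistics are per column.
+        mean = x.mean(axis=0)
+        var = np.maximum((x * x).mean(axis=0) - mean * mean, 0.0)
+        rscale = (var + epsilon) ** -0.5            # power == -0.5
+        z = (x - mean) * rscale                     # plain batch-norm output
+        # correction terms; no gradient is propagated through them.
+        clipped_r = np.clip((1.0 / rscale) / moving_stddev, 1.0 / r_max, r_max)
+        clipped_d = np.clip((mean - moving_mean) / moving_stddev, -d_max, d_max)
+        return z * clipped_r + clipped_d
+
+  With the default r-max = 1.0 and d-max = 0.0 the corrections are clipped to
+  r = 1 and d = 0, and the component behaves like ordinary batch-norm.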
+
+   FORWARD PASS:
+
+   Let 'power' be a constant, equal to -0.5 for regular batch-renorm.
+
+   To simplify the math we (conceptually, not physically) do the normalization in
+   two stages:  first mean, then variance, so we have x(i) -> y(i) -> z(i).
+
+   The name 'rscale' means 'raw scale', meaning the scale before including
+   target-rms.  Later we'll define 'scale = target-rms * rscale', to make some
+   of the actual computations slightly more efficient.  Note that for
+   batch-renorm we fix target-rms = 1.0.
+
+   clipped_r and clipped_d are the correction terms of batch-renorm, clipped to
+   their allowed ranges; they are treated as constants in the backward pass
+   (back-prop is stopped through them).
+
+   Define:   mean = 1/I * sum_i x(i)
+             y(i) = x(i) - mean
+
+             var = 1/I \sum_i y(i)^2
+          rscale = clipped_r * (var + epsilon)^power   <---- For regular batch-renorm, power == -0.5.
+            z(i) = rscale * y(i) + clipped_d
+
+
+   Most of the rest of this comment derives how to compute the derivatives.  If
+   you just want the formulas, please skip to the string 'BACKWARD PASS' below.
+
+   We'll use a notation where an apostrophe on something means (the derivative of
+   the objective function w.r.t. that thing), so y'(i) is df/dy(i), and so on.
+   We are given z'(i).   Propagating the derivatives backward:
+
+   rscale' = (sum_i y(i) z'(i))
+           = (sum_i (z(i) - clipped_d) * z'(i)) / rscale
+
+   [ note: d(rscale)/d(var) = clipped_r * power * (var + epsilon)^{power - 1}
+                            = clipped_r^(1/power) * power * rscale^{(power-1)/power} ]
+
+   var' = rscale' * clipped_r^(1/power) * power * rscale^{(power-1)/power}
+        = (sum_i (z(i) - clipped_d) * z'(i)) / rscale * clipped_r^(1/power) * power * rscale^{(power-1)/power}
+        = (sum_i (z(i) - clipped_d) * z'(i)) * clipped_r^(1/power) * power * rscale^(-1/power)
+        = clipped_r^(1/power) * power * (sum_i (z(i) - clipped_d) * z'(i)) * rscale^(-1/power)
+
+   [note: the following formula is of the form "direct term" + "indirect term"]
+   y'(i) = z'(i) * rscale  +  2/I y(i) var'
+
+   Now, the above is inconvenient because it contains y(i) which is an intermediate
+   quantity.  We reformulate in terms of z(i), using y(i) = (z(i) - clipped_d) / rscale, so:
+
+   defining
+     var_deriv_mod = 2/I * var' / rscale
+                   = clipped_r^(1/power) * 2/I * power * (sum_i (z(i) - clipped_d) * z'(i)) * rscale^{-(1+power)/power}
+   we have:
+     y'(i) = z'(i) * rscale  +  2/I y(i) var'
+           = z'(i) * rscale  +  2/I (z(i) - clipped_d) / rscale * var'
+           = z'(i) * rscale  +  (z(i) - clipped_d) var_deriv_mod
+
+   Now,
+     mean' = \sum_i y'(i)
+           = rscale * \sum_i z'(i)  +  var_deriv_mod * \sum_i (z(i) - clipped_d)
+             [ ... and the 2nd term is zero because \sum_i z(i) = I * clipped_d ... ]
+           = rscale * \sum_i z'(i)
+   and:
+     x'(i) = z'(i) * rscale  +  (z(i) - clipped_d) * var_deriv_mod  -  1/I mean'
+           = z'(i) * rscale  +  (z(i) - clipped_d) * var_deriv_mod  -  1/I * rscale * \sum_i z'(i)
+           = rscale * (z'(i) - 1/I * \sum_i z'(i))  +  (z(i) - clipped_d) var_deriv_mod
+
+   Since for batch-renorm target-rms = 1.0, we have scale == rscale, and we can
+   write the result as follows:
+
+   BACKWARD PASS (recap):
+
+      var_deriv_mod = clipped_r^(1/power) * 2/I * power * (sum_i (z(i) - clipped_d) * z'(i)) * rscale^{-(1+power)/power}
+
+                   ..
which for power = -0.5, simplifies to: + var_deriv_mod = -1.0 * (clipped_r)^(-2) * (1/I \sum_i (z(i) - clipped_d) * z'(i)) * scale + + x'(i) = scale * (z'(i) - 1/I * \sum_i z'(i)) + (z(i) - clipped_d) var_deriv_mod +*/ +void* BatchRenormComponent::Propagate(const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const { + KALDI_ASSERT(SameDim(in, *out) && + (in.NumCols() == dim_ || in.NumCols() == block_dim_)); + if (in.NumCols() != block_dim_) { + // if block_dim_ != dim_, we recurse; this helps keep the main code + // simple. + KALDI_ASSERT(in.Stride() == in.NumCols() && out->Stride() == out->NumCols()); + int32 ratio = dim_ / block_dim_, orig_rows = in.NumRows(), + orig_cols = in.NumCols(), new_rows = orig_rows * ratio, + new_cols = orig_cols / ratio; + CuSubMatrix in_reshaped(in.Data(), new_rows, new_cols, new_cols), + out_reshaped(out->Data(), new_rows, new_cols, new_cols); + return Propagate(indexes, in_reshaped, &out_reshaped); + } + + // From this point, we can assume that the num-cols of 'in' and 'out' + // equals block_dim_. + + if (!test_mode_) { + // search in the comment above for FORWARD PASS to see what is being + // implemented here. + // if this takes too much time due to multiple different CUDA calls, + // we'll consider making a single kernel for some of it. + Memo *memo = new Memo; + int32 num_frames = in.NumRows(), dim = block_dim_; + memo->num_frames = num_frames; + memo->mean_uvar_scale.Resize(8, dim); + CuSubVector mean(memo->mean_uvar_scale, 0), + uvar(memo->mean_uvar_scale, 1), + scale(memo->mean_uvar_scale, 2), + clipped_r(memo->mean_uvar_scale, 5), + clipped_d(memo->mean_uvar_scale, 6); + + mean.AddRowSumMat(1.0 / num_frames, in, 0.0); + uvar.AddDiagMat2(1.0 / num_frames, in, kTrans, 0.0); + scale.CopyFromVec(uvar); + + // by applying this scale at this point, we save a multiply later on. + BaseFloat var_scale = 1.0 / (target_rms_ * target_rms_); + scale.AddVecVec(-var_scale, mean, mean, var_scale); + // at this point, 'scale' contains just the variance (times target-rms^{-2}). + scale.ApplyFloor(0.0); + scale.Add(var_scale * epsilon_); + // Now 'scale' contains the variance floored to zero and then with epsilon + // added [both times 1/target-rms^2]. + scale.ApplyPow(-0.5); + // now 'scale' is the actual scale we'll use. + + // the next command will do no work if out == in, for in-place propagation. + out->CopyFromMat(in); + out->AddVecToRows(-1.0, mean, 1.0); + out->MulColsVec(scale); + + if (!training_begining_) { + // update clipped update + CuVector moving_mean_copy(moving_mean_), moving_stddev_copy(moving_stddev_); + clipped_d.CopyFromVec(mean); + clipped_d.AddVec(-1.0, moving_mean_copy); + moving_stddev_copy.ApplyPow(-1.0); + clipped_d.MulElements(moving_stddev_copy); + + CuVector stddv_tmpt(scale); + stddv_tmpt.ApplyPow(-1.0); + clipped_r.CopyFromVec(stddv_tmpt); + clipped_r.MulElements(moving_stddev_copy); + + clipped_r.ApplyCeiling(r_max_); + clipped_r.ApplyFloor(1.0 / r_max_); + clipped_d.ApplyCeiling(d_max_); + clipped_d.ApplyFloor(- d_max_); + + out->MulColsVec(clipped_r); + out->AddVecToRows(1.0, clipped_d, 1.0); + } else { + clipped_r.Set(1); + clipped_d.Set(0); + } + return static_cast(memo); + } else { + if (offset_.Dim() != block_dim_) { + if (count_ == 0) + KALDI_ERR << "Test mode set in BatchRenormComponent, but no stats."; + else // why was ComputeDerived() not called? 
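+      // For reference, the training-time (non-test-mode) branch above as a
+      // minimal scalar sketch in plain C++ (names and values are ad hoc and
+      // illustrative, not members of this class):
+      //
+      //   std::vector<double> x = {0.5, 2.5, 1.0};     // one dimension of the minibatch
+      //   double epsilon = 1.0e-03, moving_mean = 1.0, moving_stddev = 1.0,
+      //       r_max = 3.0, d_max = 5.0;
+      //   double mean = 0.0, uvar = 0.0;
+      //   for (double v : x) { mean += v; uvar += v * v; }
+      //   mean /= x.size();  uvar /= x.size();
+      //   double stddev = std::sqrt(std::max(uvar - mean * mean, 0.0) + epsilon),
+      //       r = std::min(std::max(stddev / moving_stddev, 1.0 / r_max), r_max),
+      //       d = std::min(std::max((mean - moving_mean) / moving_stddev, -d_max), d_max);
+      //   for (double &v : x) v = ((v - mean) / stddev) * r + d;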
+ KALDI_ERR << "Code error in BatchRenormComponent"; + } + out->CopyFromMat(in); + out->MulColsVec(scale_); + out->AddVecToRows(1.0, offset_, 1.0); + return NULL; + } +} + +void BatchRenormComponent::Backprop( + const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in_value, // unused + const CuMatrixBase &out_value, + const CuMatrixBase &out_deriv, + void *memo_in, + Component *to_update, // unused + CuMatrixBase *in_deriv) const { + + KALDI_ASSERT(SameDim(out_value, out_deriv) && + SameDim(out_value, *in_deriv) && + (out_value.NumCols() == dim_ || + out_value.NumCols() == block_dim_)); + if (out_value.NumCols() != block_dim_) { + // if block_dim_ != dim_, we recurse; this helps keep the main code + // simple. + KALDI_ASSERT(out_value.Stride() == out_value.NumCols() && + out_deriv.Stride() == out_deriv.NumCols() && + in_deriv->Stride() == in_deriv->NumCols()); + int32 ratio = dim_ / block_dim_, + orig_rows = out_value.NumRows(), + orig_cols = out_value.NumCols(), + new_rows = orig_rows * ratio, new_cols = orig_cols / ratio; + CuSubMatrix out_value_reshaped(out_value.Data(), new_rows, + new_cols, new_cols), + out_deriv_reshaped(out_deriv.Data(), new_rows, new_cols, new_cols), + in_deriv_reshaped(in_deriv->Data(), new_rows, new_cols, new_cols); + // we'll never use in_value, so pass it in unchanged. + Backprop(debug_info, indexes, in_value, + out_value_reshaped, out_deriv_reshaped, + memo_in, to_update, &in_deriv_reshaped); + return; + } + + Memo *memo = static_cast(memo_in); + + if (!test_mode_) { + // search above for BACKWARD PASS for a comment describing the math. + KALDI_ASSERT(memo != NULL && "memo not passed into backprop"); + int32 num_frames = memo->num_frames; + KALDI_ASSERT(out_value.NumRows() == num_frames); + CuSubVector + scale(memo->mean_uvar_scale, 2), + var_deriv_mod(memo->mean_uvar_scale, 3), + temp(memo->mean_uvar_scale, 4), + clipped_r(memo->mean_uvar_scale, 5), + clipped_d(memo->mean_uvar_scale, 6); + + // rscale == clipped_r * (var + epsilon)^power == clipped_r * scale_ !! this scale_ is from memo + // and target-rms = 1.0, so scale = rscale + scale.MulElements(clipped_r); + + // var_deriv_mod is going to contain: + // -1.0 * (clipped_r)^(-2) * (1/I \sum_i (z(i) - clipped_d) * z'(i)) * scale + // but for now we don't have the power of 'scale', we'll add that later. + BaseFloat coeff = -1.0 / num_frames; + // z(i) - clipped_d + CuMatrix out_value_minus_clipped_d(out_value); + out_value_minus_clipped_d.AddVecToRows(-1.0, clipped_d, 1.0); + + // -1.0 * (1/I \sum_i (z(i) - clipped_d) * z'(i)) + var_deriv_mod.AddDiagMatMat(coeff, out_value_minus_clipped_d, kTrans, + out_deriv, kNoTrans, 0.0); + // -1.0 * (1/I \sum_i (z(i) - clipped_d) * z'(i)) * scale + var_deriv_mod.MulElements(scale); + // -1.0 * (clipped_r)^(-2) * (1/I \sum_i (z(i) - clipped_d) * z'(i)) * scale + clipped_r.ApplyPow(-2.0); + var_deriv_mod.MulElements(clipped_r); + clipped_r.ApplyPow(-0.5); + + temp.AddRowSumMat(-1.0 / num_frames, out_deriv, 0.0); + // the following statement does no work if in_deriv and out_deriv are the + // same matrix. 
+ in_deriv->CopyFromMat(out_deriv); + in_deriv->AddVecToRows(1.0, temp); + // At this point, *in_deriv contains + // (z'(i) - 1/I * \sum_i z'(i)) + in_deriv->MulColsVec(scale); + // At this point, *in_deriv contains + // scale * (z'(i) - 1/I * \sum_i z'(i)) + + in_deriv->AddMatDiagVec(1.0, out_value_minus_clipped_d, kNoTrans, + var_deriv_mod, 1.0); + + // At this point, *in_deriv contains what we described in the comment + // starting BATCHNORM_MATH as: + // x'(i) = scale * (z'(i) - 1/I * \sum_i z'(i)) + (z(i) - clipped_d) var_deriv_mod + + // to scale the memo scale back to its original value + scale.DivElements(clipped_r); + } else { + KALDI_ASSERT(offset_.Dim() == block_dim_); + // the next call does no work if they point to the same memory. + in_deriv->CopyFromMat(out_deriv); + in_deriv->MulColsVec(scale_); + } +} + +void BatchRenormComponent::StoreStats( + const CuMatrixBase &in_value, + const CuMatrixBase &out_value, + void *memo_in) { + // in test mode this component does not store stats, it doesn't provide the + // kStoresStats flag. + KALDI_ASSERT(!test_mode_); + KALDI_ASSERT(out_value.NumCols() == dim_ || out_value.NumCols() == block_dim_); + if (out_value.NumCols() != block_dim_) { + // if block_dim_ != dim_, we recurse; this helps keep the main code + // simple. + KALDI_ASSERT(out_value.Stride() == out_value.NumCols()); + int32 ratio = dim_ / block_dim_, + orig_rows = out_value.NumRows(), + orig_cols = out_value.NumCols(), + new_rows = orig_rows * ratio, new_cols = orig_cols / ratio; + CuSubMatrix out_value_reshaped(out_value.Data(), new_rows, + new_cols, new_cols); + // we'll never use in_value, so just pass it in unchanged. + StoreStats(in_value, out_value_reshaped, memo_in); + return; + } + + Memo *memo = static_cast(memo_in); + KALDI_ASSERT(out_value.NumRows() == memo->num_frames); + + CuSubVector mean(memo->mean_uvar_scale, 0), + uvar(memo->mean_uvar_scale, 1), + scale(memo->mean_uvar_scale, 2); + KALDI_ASSERT(mean.Dim() == block_dim_ && memo->num_frames > 0); + BaseFloat num_frames = memo->num_frames; + if (stats_sum_.Dim() != block_dim_) { + stats_sum_.Resize(block_dim_); + stats_sumsq_.Resize(block_dim_); + moving_mean_.Resize(block_dim_); + moving_stddev_.Resize(block_dim_); + KALDI_ASSERT(count_ == 0); + } + count_ += num_frames; + stats_sum_.AddVec(num_frames, mean, 1.0); + stats_sumsq_.AddVec(num_frames, uvar, 1.0); + if (training_begining_) { + training_begining_ = false; + moving_mean_.CopyFromVec(mean); + scale.ApplyPow(-1.0); + moving_stddev_.CopyFromVec(scale); + scale.ApplyPow(-1.0); + } else { + BaseFloat alpha_2 = 1.0 - alpha_; + + moving_mean_.Scale(alpha_2); + moving_mean_.AddVec(alpha_, mean); + + scale.ApplyPow(-1.0); + moving_stddev_.Scale(alpha_2); + moving_stddev_.AddVec(alpha_, scale); + scale.ApplyPow(-1.0); + } +} + +void BatchRenormComponent::Read(std::istream &is, bool binary) { + ExpectOneOrTwoTokens(is, binary, "", ""); + ReadBasicType(is, binary, &dim_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &block_dim_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &epsilon_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &target_rms_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &test_mode_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &count_); + ExpectToken(is, binary, ""); + stats_sum_.Read(is, binary); + ExpectToken(is, binary, ""); + stats_sumsq_.Read(is, binary); + stats_sumsq_.AddVecVec(1.0, stats_sum_, stats_sum_, 1.0); + stats_sum_.Scale(count_); + 
stats_sumsq_.Scale(count_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &training_begining_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &r_max_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &d_max_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &alpha_); + ExpectToken(is, binary, ""); + moving_mean_.Read(is, binary); + ExpectToken(is, binary, ""); + moving_stddev_.Read(is, binary); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &average_count_); + ExpectToken(is, binary, ""); + ComputeDerived(); + Check(); +} + +void BatchRenormComponent::Write(std::ostream &os, bool binary) const { + Check(); + WriteToken(os, binary, ""); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, dim_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, block_dim_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, epsilon_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, target_rms_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, test_mode_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, count_); + CuVector mean(stats_sum_), var(stats_sumsq_); + if (count_ != 0) { + mean.Scale(1.0 / count_); + var.Scale(1.0 / count_); + var.AddVecVec(-1.0, mean, mean, 1.0); + } + WriteToken(os, binary, ""); + mean.Write(os, binary); + WriteToken(os, binary, ""); + var.Write(os, binary); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, training_begining_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, r_max_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, d_max_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, alpha_); + WriteToken(os, binary, ""); + moving_mean_.Write(os, binary); + WriteToken(os, binary, ""); + moving_stddev_.Write(os, binary); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, average_count_); + WriteToken(os, binary, ""); +} + +void BatchRenormComponent::Scale_Training(BaseFloat scale) { + KALDI_WARN << "Scale during training : "<< scale; + if (scale == 0) { + count_ = 0.0; + average_count_ = 0.0; + stats_sum_.SetZero(); + stats_sumsq_.SetZero(); + moving_mean_.SetZero(); + moving_stddev_.SetZero(); + } else { + count_ *= scale; + stats_sum_.Scale(scale); + stats_sumsq_.Scale(scale); + } +} + +void BatchRenormComponent::Scale(BaseFloat scale) { + KALDI_WARN << "Scale during averaging : " << scale; + if (scale == 0) { + count_ = 0.0; + average_count_ = 0.0; + stats_sum_.SetZero(); + stats_sumsq_.SetZero(); + moving_mean_.SetZero(); + moving_stddev_.SetZero(); + } else { + count_ *= scale; + average_count_ *= scale; + stats_sum_.Scale(scale); + stats_sumsq_.Scale(scale); + moving_mean_.Scale(scale); + moving_stddev_.Scale(scale); + } +} + +void BatchRenormComponent::Add(BaseFloat alpha, const Component &other_in) { + const BatchRenormComponent *other = + dynamic_cast(&other_in); + count_ += alpha * other->count_; + stats_sum_.AddVec(alpha, other->stats_sum_); + stats_sumsq_.AddVec(alpha, other->stats_sumsq_); + + KALDI_WARN << "Average_count : " << average_count_; + KALDI_WARN << "Other Average_count : " << other->average_count_; + KALDI_WARN << "Add alpha scale : "<< alpha; + double average_count_copy(average_count_); + CuVector moving_mean_copy(moving_mean_), moving_stddev_copy(moving_stddev_); + KALDI_WARN << "Moving mean copy : " << SummarizeVector(moving_mean_copy); + KALDI_WARN << "Moving stddev copy : "<< SummarizeVector(moving_stddev_copy); + average_count_ += alpha * other->average_count_; + 
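+  // Sketch of the intended effect of the lines below for the two-model case,
+  // assuming *this was already scaled by its weight w0 (as done via
+  // ScaleNnetForAverage() in nnet3-average.cc) and 'other' carries weight w1:
+  //   moving_mean <-- (w0 * moving_mean_0 + w1 * moving_mean_1) / (w0 + w1),
+  // and likewise for moving_stddev; average_count_ temporarily accumulates the
+  // total weight and is restored afterwards.  See the TODO in the header about
+  // making this more robust.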
moving_mean_.AddVec(alpha, other->moving_mean_); + moving_stddev_.AddVec(alpha, other->moving_stddev_); + moving_mean_.Scale(1.0 / average_count_); + moving_stddev_.Scale(1.0 / average_count_); + KALDI_WARN << "Moving mean copy after: " << SummarizeVector(moving_mean_copy); + KALDI_WARN << "Moving stddev copy after: "<< SummarizeVector(moving_stddev_copy); + average_count_ = average_count_copy; + // this operation might change offset_ and scale_, so we recompute them + // in this instance (but not in Scale()). + ComputeDerived(); +} + +void BatchRenormComponent::ZeroStats() { + // We only zero the stats if we're not in test mode. In test mode, this would + // be dangerous as the stats are the source for the transform, and zeroing + // them and then calling ComputeDerived() again would remove the transform + // parameters (offset_ and scale_). + if (!test_mode_) { + count_ = 0.0; + average_count_ = 1.0; + stats_sum_.SetZero(); + stats_sumsq_.SetZero(); + } +} + +void BatchRenormComponent::SetBatchRenormCorrections(BaseFloat r_max, BaseFloat d_max) { + r_max_ = r_max; + d_max_ = d_max; + } } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-normalize-component.h b/src/nnet3/nnet-normalize-component.h index 37ad624d0f0..d17b3bf2ea3 100644 --- a/src/nnet3/nnet-normalize-component.h +++ b/src/nnet3/nnet-normalize-component.h @@ -7,6 +7,7 @@ // 2014-2015 Guoguo Chen // 2015 Daniel Galvez // 2015 Tom Ko +// 2018 Gaofeng Cheng (Institute of Acoustics, Chinese Academy of Sciences) // See ../../COPYING for clarification regarding multiple authors // @@ -223,6 +224,7 @@ class BatchNormComponent: public Component { // Note: the offset and scale will only be nonempty in 'test mode'. const CuVector &Offset() const { return offset_; } const CuVector &Scale() const { return scale_; } + bool IsBatchRenorm() const { return false; } private: @@ -294,6 +296,195 @@ class BatchNormComponent: public Component { CuVector scale_; }; +/* + BatchRenormComponent + + This implements batch-renorm normalization; See details in : https://arxiv.org/abs/1702.03275 + + If you want to combine this with the trainable offset and scale that the + original BatchNorm paper used, then follow this by the + ScaleAndOffsetComponent. + + It's a simple component (uses the kSimpleComponent flag), but it is unusual in + that it will give different results if you call it on half the matrix at a + time. Most of the time this would be pretty harmless, so we still return the + kSimpleComponent flag. We may have to modify the test code a little to + account for this, or possibly remove the kSimpleComponent flag. In some sense + each output Index depends on every input Index, but putting those dependencies + explicitly into the dependency-tracking framework as a GeneralComponent + would be very impractical and might lead to a lot of unnecessary things being + computed. You have to be a bit careful where you put this component, and understand + what you're doing e.g. putting it in the path of a recurrence is a bit problematic + if the minibatch size is small. + + Accepted configuration values: + dim Dimension of the input and output + block-dim Defaults to 'dim', but may be set to a divisor + of 'dim'. In this case, each block of dimension 'block-dim' + is treated like a separate row of the input matrix, which + means that the stats from n'th element of each + block are pooled into one class, for each n. 
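+                   For example, with dim=200 and block-dim=50 the stats are
+                   pooled over 4 blocks of dimension 50, which is how spatial
+                   batch normalization is done for convolutional setups (see
+                   nnet-convolutional-component.h).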
+ epsilon Small term added to the variance that is used to prevent + division by zero + target-rms This defaults to 1.0, but if set, for instance, to 2.0, + it will normalize the standard deviation of the output to + 2.0. 'target-stddev' might be a more suitable name, but this + was chosen for consistency with NormalizeComponent. + alpha This is the decay-momentum used for the moving-averages, + see details in : https://arxiv.org/abs/1702.03275 + */ +class BatchRenormComponent: public Component { + public: + + BatchRenormComponent() { } + + // call this with 'true' to set 'test mode' where the batch normalization is + // done with stored stats. There won't normally be any need to specially + // accumulate these stats; they are stored as a matter of course on each + // iteration of training, as for NonlinearComponents, and we'll use the stats + // from the most recent [script-level] iteration. + // (Note: it will refuse to actually set test-mode to true if there + // are no stats stored.) + void SetTestMode(bool test_mode); + + // constructor using another component + BatchRenormComponent(const BatchRenormComponent &other); + + virtual int32 InputDim() const { return dim_; } + virtual int32 OutputDim() const { return dim_; } + + virtual std::string Info() const; + virtual void InitFromConfig(ConfigLine *cfl); + virtual std::string Type() const { return "BatchRenormComponent"; } + virtual int32 Properties() const { + // If the block-dim is less than the dim, we need the input and output + // matrices to be contiguous (stride==num-cols), as we'll be reshaping + // internally. This is not much of a cost, because this will be used + // in convnets where we have to do this anyway. + return kSimpleComponent|kBackpropNeedsOutput|kPropagateInPlace| + kBackpropInPlace| + (block_dim_ < dim_ ? kInputContiguous|kOutputContiguous : 0)| + (test_mode_ ? 0 : kUsesMemo|kStoresStats); + } + virtual void* Propagate(const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const; + virtual void Backprop(const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in_value, + const CuMatrixBase &out_value, + const CuMatrixBase &out_deriv, + void *memo, + Component *, // to_update, + CuMatrixBase *in_deriv) const; + + virtual void Read(std::istream &is, bool binary); // This Read function + // requires that the Component has the correct type. + + /// Write component to stream + virtual void Write(std::ostream &os, bool binary) const; + virtual Component* Copy() const { return new BatchRenormComponent(*this); } + + virtual void Scale(BaseFloat scale); + virtual void Scale_Training(BaseFloat scale); + virtual void Add(BaseFloat alpha, const Component &other); + virtual void ZeroStats(); + + + virtual void DeleteMemo(void *memo) const { delete static_cast(memo); } + + virtual void StoreStats(const CuMatrixBase &in_value, + const CuMatrixBase &out_value, + void *memo); + virtual void SetBatchRenormCorrections(BaseFloat r_max, BaseFloat d_max); + + // Members specific to this component type. + // Note: the offset and scale will only be nonempty in 'test mode'. + const CuVector &Offset() const { return offset_; } + const CuVector &Scale() const { return scale_; } + bool IsBatchRenorm() const { return true; } + + private: + + struct Memo { + // number of frames (after any reshaping). 
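+    // For reference, a config line accepted by InitFromConfig() might look
+    // like the following (the component name and all values are illustrative):
+    //
+    //   component name=renorm1 type=BatchRenormComponent dim=625 \
+    //     epsilon=1e-03 r-max=3.0 d-max=5.0 alpha=0.01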
+ int32 num_frames; + // 'sum_sumsq_scale' is of dimension 5 by block_dim_: + // Row 0 = mean = the mean of the rows of the input + // Row 1 = uvar = the uncentered variance of the input (= sumsq / num_frames). + // Row 2 = scale = the scale of the renormalization. + // Rows 3 and 4 are used as temporaries in Backprop. + CuMatrix mean_uvar_scale; + }; + + void Check() const; + + // this function is used in a couple of places; it turns the raw stats into + // the offset/scale term of a normalizing transform. + static void ComputeOffsetAndScale(double count, + BaseFloat epsilon, + const Vector &stats_sum, + const Vector &stats_sumsq, + Vector *offset, + Vector *scale); + // computes derived parameters offset_ and scale_. + void ComputeDerived(); + + // Dimension of the input and output. + int32 dim_; + // This would normally be the same as dim_, but if it's less (and it must be > + // 0 and must divide dim_), then each separate block of the input of dimension + // 'block_dim_' is treated like a separate frame for the purposes of + // normalization. This can be used to implement spatial batch normalization + // for convolutional setups-- assuming the filter-dim has stride 1, which it + // always will in the new code in nnet-convolutional-component.h. + int32 block_dim_; + + // Used to avoid exact-zero variances, epsilon has the dimension of a + // covariance. + BaseFloat epsilon_; + + // This value will normally be 1.0, which is the default, but you can set it + // to other values as a way to control how fast the following layer learns + // (smaller -> slower). The same config exists in NormalizeComponent. + BaseFloat target_rms_; + + // This is true if we want the batch normalization to operate in 'test mode' + // meaning the data mean and stddev used for the normalization are fixed + // quantities based on previously accumulated stats. Note: the stats we use + // for this are based on the same 'StoreStats' mechanism as we use for + // components like SigmoidComponent and ReluComponent; we'll be using + // the stats from the most recent [script-level] iteration of training. + bool test_mode_; + + double average_count_; + // total count of stats stored by StoreStats(). + double count_; + // sum-of-data component of stats of input data. + CuVector stats_sum_; + // sum-of-squared component of stats of input data. + CuVector stats_sumsq_; + + // + bool training_begining_; + + // the maximum allowed correction for batch renorm. + // The correction `(r, d)` is used as `corrected_value = normalized_value * r + d`, + // with `r` clipped to [1 / rmax, rmax], and `d` to [-dmax, dmax]. + BaseFloat r_max_; + BaseFloat d_max_; + + // decay-factor for the moving averages. 
+ BaseFloat alpha_; + + CuVector moving_mean_; + CuVector moving_stddev_; + + // [TODO] : a more robust method to generate scale_ and offset_ for inference + // temporarily, offset_ and scale_ is picked from one of the parallel models + CuVector offset_; + CuVector scale_; +}; } // namespace nnet3 diff --git a/src/nnet3/nnet-utils.cc b/src/nnet3/nnet-utils.cc index e020f8fc6a7..d8002fe3d42 100644 --- a/src/nnet3/nnet-utils.cc +++ b/src/nnet3/nnet-utils.cc @@ -301,6 +301,22 @@ void SetNnetAsGradient(Nnet *nnet) { } void ScaleNnet(BaseFloat scale, Nnet *nnet) { + if (scale == 1.0) return; + else { + for (int32 c = 0; c < nnet->NumComponents(); c++) { + Component *comp = nnet->GetComponent(c); + BatchRenormComponent *bc = dynamic_cast(comp); + if (bc != NULL) { + bc->Scale_Training(scale); + } else { + Component *comp = nnet->GetComponent(c); + comp->Scale(scale); + } + } + } +} + +void ScaleNnetForAverage(BaseFloat scale, Nnet *nnet) { if (scale == 1.0) return; else { for (int32 c = 0; c < nnet->NumComponents(); c++) { @@ -520,6 +536,9 @@ bool HasBatchnorm(const Nnet &nnet) { const Component *comp = nnet.GetComponent(c); if (dynamic_cast(comp) != NULL) return true; + comp = nnet.GetComponent(c); + if (dynamic_cast(comp) != NULL) + return true; } return false; } @@ -532,8 +551,15 @@ void ScaleBatchnormStats(BaseFloat batchnorm_stats_scale, for (int32 c = 0; c < nnet->NumComponents(); c++) { Component *comp = nnet->GetComponent(c); BatchNormComponent *bc = dynamic_cast(comp); - if (bc != NULL) + if (bc != NULL) { bc->Scale(batchnorm_stats_scale); + } else { + comp = nnet->GetComponent(c); + BatchRenormComponent *bc = dynamic_cast(comp); + if (bc != NULL) { + bc->Scale_Training(batchnorm_stats_scale); + } + } } } @@ -556,8 +582,15 @@ void SetBatchnormTestMode(bool test_mode, Nnet *nnet) { for (int32 c = 0; c < nnet->NumComponents(); c++) { Component *comp = nnet->GetComponent(c); BatchNormComponent *bc = dynamic_cast(comp); - if (bc != NULL) + if (bc != NULL) { bc->SetTestMode(test_mode); + } else { + comp = nnet->GetComponent(c); + BatchRenormComponent *bc = dynamic_cast(comp); + if (bc != NULL) { + bc->SetTestMode(test_mode); + } + } } } @@ -1641,18 +1674,32 @@ class ModelCollapser { const BatchNormComponent *batchnorm_component = dynamic_cast( nnet_->GetComponent(component_index1)); - if (batchnorm_component == NULL) + const BatchRenormComponent *batchrenorm_component = + dynamic_cast( + nnet_->GetComponent(component_index1)); + if (batchnorm_component != NULL && batchrenorm_component != NULL) { + KALDI_ERR << "Something seems very wrong, a component belongs to both batch-norm and batch-renorm ?"; + } else if (batchnorm_component == NULL && batchrenorm_component == NULL) { return -1; - - if (batchnorm_component->Offset().Dim() == 0) { - KALDI_ERR << "Expected batch-norm components to have test-mode set."; + } else if (batchnorm_component != NULL) { + if (batchnorm_component->Offset().Dim() == 0) { + KALDI_ERR << "Expected batch-norm components to have test-mode set."; + } + std::string batchnorm_component_name = nnet_->GetComponentName(component_index1); + return GetDiagonallyPreModifiedComponentIndex(batchnorm_component->Offset(), + batchnorm_component->Scale(), + batchnorm_component_name, + component_index2); + } else { + if (batchrenorm_component->Offset().Dim() == 0) { + KALDI_ERR << "Expected batch-norm components to have test-mode set."; + } + std::string batchrenorm_component_name = nnet_->GetComponentName(component_index1); + return 
GetDiagonallyPreModifiedComponentIndex(batchrenorm_component->Offset(), + batchrenorm_component->Scale(), + batchrenorm_component_name, + component_index2); } - std::string batchnorm_component_name = nnet_->GetComponentName( - component_index1); - return GetDiagonallyPreModifiedComponentIndex(batchnorm_component->Offset(), - batchnorm_component->Scale(), - batchnorm_component_name, - component_index2); } diff --git a/src/nnet3bin/nnet3-average.cc b/src/nnet3bin/nnet3-average.cc index d794e37e50d..face910a654 100644 --- a/src/nnet3bin/nnet3-average.cc +++ b/src/nnet3bin/nnet3-average.cc @@ -61,7 +61,7 @@ void ReadModels(std::vector > models_and_weigh try { int32 n = models_and_weights.size(); ReadKaldiObject(models_and_weights[0].first, output_nnet); - ScaleNnet(models_and_weights[0].second, output_nnet); + ScaleNnetForAverage(models_and_weights[0].second, output_nnet); for (int32 i = 1; i < n; i++) { Nnet nnet; ReadKaldiObject(models_and_weights[i].first, &nnet); diff --git a/src/nnet3bin/nnet3-combine.cc b/src/nnet3bin/nnet3-combine.cc index 4bcf4cdfb6d..1050148fbd6 100644 --- a/src/nnet3bin/nnet3-combine.cc +++ b/src/nnet3bin/nnet3-combine.cc @@ -66,7 +66,7 @@ double ComputeObjf(bool batchnorm_test_mode, bool dropout_test_mode, void UpdateNnetMovingAverage(int32 num_models, const Nnet &nnet, Nnet *moving_average_nnet) { KALDI_ASSERT(NumParameters(nnet) == NumParameters(*moving_average_nnet)); - ScaleNnet((num_models - 1.0) / num_models, moving_average_nnet); + ScaleNnetForAverage((num_models - 1.0) / num_models, moving_average_nnet); AddNnet(nnet, 1.0 / num_models, moving_average_nnet); } @@ -106,7 +106,7 @@ int main(int argc, char *argv[]) { "if the number of models provided to this binary is quite " "large (e.g. several hundred)."); po.Register("batchnorm-test-mode", &batchnorm_test_mode, - "If true, set test-mode to true on any BatchNormComponents " + "If true, set test-mode to true on any BatchNormComponents or BatchRenormComponents" "while evaluating objectives."); po.Register("dropout-test-mode", &dropout_test_mode, "If true, set test-mode to true on any DropoutComponents and " diff --git a/src/online2/online-gmm-decodable.h b/src/online2/online-gmm-decodable.h index c037ad0efe4..1a1d37ba2a2 100644 --- a/src/online2/online-gmm-decodable.h +++ b/src/online2/online-gmm-decodable.h @@ -24,8 +24,10 @@ #define KALDI_ONLINE2_ONLINE_GMM_DECODABLE_H_ #include "itf/online-feature-itf.h" -#include "gmm/decodable-am-diag-gmm.h" #include "matrix/matrix-lib.h" +#include "itf/decodable-itf.h" +#include "gmm/am-diag-gmm.h" +#include "hmm/transition-model.h" namespace kaldi { @@ -37,20 +39,20 @@ class DecodableDiagGmmScaledOnline : public DecodableInterface { const BaseFloat scale, OnlineFeatureInterface *input_feats); - + /// Returns the scaled log likelihood virtual BaseFloat LogLikelihood(int32 frame, int32 index); - + virtual bool IsLastFrame(int32 frame) const; - virtual int32 NumFramesReady() const; - + virtual int32 NumFramesReady() const; + /// Indices are one-based! This is for compatibility with OpenFst. virtual int32 NumIndices() const { return trans_model_.NumTransitionIds(); } private: void CacheFrame(int32 frame); - + OnlineFeatureInterface *features_; const AmDiagGmm &ac_model_; BaseFloat ac_scale_; diff --git a/src/online2/online-speex-wrapper.cc b/src/online2/online-speex-wrapper.cc index 0af5bd90bd0..e41a812ca32 100644 --- a/src/online2/online-speex-wrapper.cc +++ b/src/online2/online-speex-wrapper.cc @@ -18,7 +18,7 @@ // limitations under the License. 
#include -#include "online-speex-wrapper.h" +#include "online2/online-speex-wrapper.h" namespace kaldi { diff --git a/src/online2/onlinebin-util.cc b/src/online2/onlinebin-util.cc index 74c594eeb79..f143ebbc5f7 100644 --- a/src/online2/onlinebin-util.cc +++ b/src/online2/onlinebin-util.cc @@ -20,7 +20,7 @@ // See the Apache 2 License for the specific language governing permissions and // limitations under the License. -#include "onlinebin-util.h" +#include "online2/onlinebin-util.h" namespace kaldi { diff --git a/src/tree/tree-renderer.cc b/src/tree/tree-renderer.cc index cbc2ce05677..bbaa5cda162 100644 --- a/src/tree/tree-renderer.cc +++ b/src/tree/tree-renderer.cc @@ -19,6 +19,8 @@ #include "tree/tree-renderer.h" +#include "tree/context-dep.h" + namespace kaldi { const int32 TreeRenderer::kEdgeWidth = 1; const int32 TreeRenderer::kEdgeWidthQuery = 3; diff --git a/src/tree/tree-renderer.h b/src/tree/tree-renderer.h index 5e0b0d89198..78f4b9aa403 100644 --- a/src/tree/tree-renderer.h +++ b/src/tree/tree-renderer.h @@ -23,7 +23,6 @@ #include "base/kaldi-common.h" #include "tree/event-map.h" #include "util/common-utils.h" -#include "hmm/transition-model.h" #include "fst/fstlib.h" namespace kaldi { diff --git a/tools/extras/check_dependencies.sh b/tools/extras/check_dependencies.sh index 52332dfed05..1b63c4c99d9 100755 --- a/tools/extras/check_dependencies.sh +++ b/tools/extras/check_dependencies.sh @@ -66,7 +66,7 @@ if ! echo "#include " | $CXX -E - >&/dev/null; then add_packages zlib-devel zlib1g-dev zlib-devel fi -for f in make automake autoconf patch grep bzip2 gzip wget git sox; do +for f in make automake autoconf patch grep bzip2 gzip unzip wget git sox; do if ! which $f >&/dev/null; then echo "$0: $f is not installed." add_packages $f $f $f