diff --git a/egs/tedlium/s5_r3/local/chain/run_tdnn_1c.sh b/egs/tedlium/s5_r3/local/chain/run_tdnn_1c.sh new file mode 100755 index 00000000000..5b531fd663d --- /dev/null +++ b/egs/tedlium/s5_r3/local/chain/run_tdnn_1c.sh @@ -0,0 +1,249 @@ +#!/bin/bash + +# 1g is as 1f but moving to a factorized TDNN (TDNN-F) model, re-tuning it, and +# switching to unconstrained egs (the last of which gives around 0.1% +# improvement). (Note: I don't believe the Tedlium TDNN models were, +# previously, very well-tuned). + +# local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn1f_sp_bi exp/chain_cleaned/tdnn1g_sp +# System tdnn1f_sp_bi tdnn1g_sp +# WER on dev(orig) 8.9 7.9 +# WER on dev(rescored) 8.1 7.3 +# WER on test(orig) 9.1 8.0 +# WER on test(rescored) 8.6 7.6 +# Final train prob -0.1026 -0.0637 +# Final valid prob -0.1031 -0.0750 +# Final train prob (xent) -1.4370 -0.9792 +# Final valid prob (xent) -1.4670 -0.9951 +# Num-params 6994800 9431072 + +# steps/info/chain_dir_info.pl exp/chain_cleaned/tdnn1g_sp +# exp/chain_cleaned/tdnn1g_sp: num-iters=108 nj=3..12 num-params=9.4M dim=40+100->3600 combine=-0.060->-0.060 (over 2) xent:train/valid[71,107,final]=(-1.30,-0.985,-0.979/-1.29,-1.00,-0.995) logprob:train/valid[71,107,final]=(-0.098,-0.065,-0.064/-0.100,-0.075,-0.075) + +## how you run this (note: this assumes that the run_tdnn.sh soft link points here; +## otherwise call it directly in its location). +# by default, with cleanup: +# local/chain/run_tdnn.sh + +# without cleanup: +# local/chain/run_tdnn.sh --train-set train --gmm tri3 --nnet3-affix "" & + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=15 +decode_nj=15 +xent_regularize=0.1 +dropout_schedule='0,0@0.20,0.5@0.50,0' + +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=1 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_affix=1g #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. +remove_egs=true + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + +if [ $stage -le 17 ]; then + mkdir -p $dir + + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + affine_opts="l2-regularize=0.008 dropout-proportion=0.0 dropout-per-dim-continuous=true" + tdnnf_opts="l2-regularize=0.008 dropout-proportion=0.0 bypass-scale=0.66" + linear_opts="l2-regularize=0.008 orthonormal-constraint=-1.0" + prefinal_opts="l2-regularize=0.008" + output_opts="l2-regularize=0.002" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $affine_opts dim=1024 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=0 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + linear-component name=prefinal-l dim=256 $linear_opts + + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1024 small-dim=256 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=1024 small-dim=256 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + +fi + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0 --constrained false" \ + --egs.chunk-width 150,110,100 \ + --trainer.num-chunk-per-minibatch 64 \ + --trainer.frames-per-iter 5000000 \ + --trainer.num-epochs 6 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.00025 \ + --trainer.optimization.final-effective-lrate 0.000025 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi +exit 0 diff --git a/egs/wsj/s5/steps/cleanup/internal/align_ctm_ref.py b/egs/wsj/s5/steps/cleanup/internal/align_ctm_ref.py index 848ca61ebe4..d3e012da13c 100755 --- a/egs/wsj/s5/steps/cleanup/internal/align_ctm_ref.py +++ b/egs/wsj/s5/steps/cleanup/internal/align_ctm_ref.py @@ -127,7 +127,7 @@ def read_text(text_file): "Did not get enough columns; line {0} in {1}" "".format(line, text_file.name)) elif len(parts) == 1: - logger.warn("Empty transcript for utterance %s in %s", + logger.warn("Empty transcript for utterance %s in %s", parts[0], text_file.name) yield parts[0], [] else: diff --git a/egs/wsj/s5/steps/cleanup/internal/get_ctm_edits.py b/egs/wsj/s5/steps/cleanup/internal/get_ctm_edits.py index a19c5344572..3032a4b434a 100755 --- a/egs/wsj/s5/steps/cleanup/internal/get_ctm_edits.py +++ b/egs/wsj/s5/steps/cleanup/internal/get_ctm_edits.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # Copyright 2016 Vimal Manohar # 2016 Johns Hopkins University (author: Daniel Povey) @@ -116,17 +116,17 @@ def OpenFiles(): global ctm_edits_out, edits_in, ctm_in, symbol_table, oov_word try: - ctm_edits_out = open(args.ctm_edits_out, 'w') + ctm_edits_out = open(args.ctm_edits_out, 'w', encoding='utf-8') except: sys.exit("get_ctm_edits.py: error opening ctm-edits file {0} for output".format( args.ctm_edits_out)) try: - edits_in = open(args.edits_in) + edits_in = open(args.edits_in, encoding='utf-8') except: sys.exit("get_ctm_edits.py: error opening edits file {0} for input".format( args.edits_in)) try: - ctm_in = open(args.ctm_in) + ctm_in = open(args.ctm_in, encoding='utf-8') except: sys.exit("get_ctm_edits.py: error opening ctm file {0} for input".format( args.ctm_in)) @@ -138,7 +138,7 @@ def OpenFiles(): print("get_ctm_edits.py: error: if you set the the --symbol-table option " "you must also set the --oov option", file = sys.stderr) try: - f = open(args.symbol_table, 'r') + f = open(args.symbol_table, 'r', encoding='utf-8') for line in f.readlines(): [ word, integer ] = line.split() if int(integer) == args.oov: diff --git a/egs/wsj/s5/steps/cleanup/internal/get_non_scored_words.py b/egs/wsj/s5/steps/cleanup/internal/get_non_scored_words.py index aa71fa47d84..69e0242eafb 100755 --- a/egs/wsj/s5/steps/cleanup/internal/get_non_scored_words.py +++ b/egs/wsj/s5/steps/cleanup/internal/get_non_scored_words.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # Copyright 2016 Vimal Manohar # 2016 Johns Hopkins University (author: Daniel Povey) @@ -90,7 +90,7 @@ def read_lang(lang_dir): raise try: - for line in open(lang_dir + '/words.txt').readlines(): + for line in open(lang_dir + '/words.txt', encoding='utf-8').readlines(): [ word, integer ] = line.split() if int(integer) in silence_word_ints: non_scored_words.add(word) diff --git a/egs/wsj/s5/steps/cleanup/internal/get_pron_stats.py b/egs/wsj/s5/steps/cleanup/internal/get_pron_stats.py index a33ba85d9fa..3ea217b6589 100755 --- a/egs/wsj/s5/steps/cleanup/internal/get_pron_stats.py +++ b/egs/wsj/s5/steps/cleanup/internal/get_pron_stats.py @@ -75,14 +75,14 @@ def ReadEntries(file_handle): # Each entry in the list represents the pronounciation candidate(s) of a word. # For each non- word, the entry is a list: [utt_id, word, set(pronunciation_candidates)]. e.g: # [911Mothers_2010W-0010916-0012901-1, other, set('AH DH ER', 'AH DH ER K AH N')] -# For each , we split the phones it aligns to into two parts: "nonsil_left", +# For each , we split the phones it aligns to into two parts: "nonsil_left", # which includes phones before the first silphone, and "nonsil_right", which includes -# phones after the last silphone. For example, for : 'V SIL B AH SIL', +# phones after the last silphone. For example, for : 'V SIL B AH SIL', # nonsil_left is 'V' and nonsil_right is empty ''. After processing an entry # in ctm_prons, we put it in "info" as an entry: [utt_id, word, nonsil_right] # only if it's nonsil_right segment is not empty, which may be used when processing # the next word. -# +# # Normally, one non- word is only aligned to one pronounciation candidate. However # when there is a preceding/following , like in the following example, we # assume the phones aligned to should be statistically distributed @@ -90,7 +90,7 @@ def ReadEntries(file_handle): # Thus we append the "nonsil_left" segment of these phones to the pronounciation # of the preceding word, if the last phone of this pronounciation is not a silence phone, # Similarly we can add a pron candidate to the following word. -# +# # For example, for the following part of a ctm_prons file: # 911Mothers_2010W-0010916-0012901-1 other AH DH ER # 911Mothers_2010W-0010916-0012901-1 K AH N SIL B @@ -99,11 +99,11 @@ def ReadEntries(file_handle): # 911Mothers_2010W-0010916-0012901-1 when W EH N # 911Mothers_2010W-0010916-0012901-1 people P IY P AH L # 911Mothers_2010W-0010916-0012901-1 SIL -# 911Mothers_2010W-0010916-0012901-1 heard HH ER +# 911Mothers_2010W-0010916-0012901-1 heard HH ER # 911Mothers_2010W-0010916-0012901-1 D # 911Mothers_2010W-0010916-0012901-1 that SIL DH AH T # 911Mothers_2010W-0010916-0012901-1 my M AY -# +# # The corresponding segment in the "info" list is: # [911Mothers_2010W-0010916-0012901-1, other, set('AH DH ER', 'AH DH ER K AH N')] # [911Mothers_2010W-0010916-0012901-1, , 'B' @@ -113,7 +113,7 @@ def ReadEntries(file_handle): # [911Mothers_2010W-0010916-0012901-1, , 'D'] # [911Mothers_2010W-0010916-0012901-1, that, set('SIL DH AH T')] # [911Mothers_2010W-0010916-0012901-1, my, set('M AY')] -# +# # Then we accumulate pronouciation stats from "info". Basically, for each occurence # of a word, each pronounciation candidate gets equal soft counts. e.g. In the above # example, each pron candidate of "because" gets a count of 1/4. The stats is stored @@ -139,20 +139,20 @@ def GetStatsFromCtmProns(silphones, optional_silence, non_scored_words, ctm_pron # So we apply the same merging method in these cases. if word == '' or (word in non_scored_words and word != '' and word != ''): nonsil_left = [] - nonsil_right = [] + nonsil_right = [] for phone in phones: if phone in silphones: break nonsil_left.append(phone) - + for phone in reversed(phones): if phone in silphones: break nonsil_right.insert(0, phone) - + # info[-1][0] is the utt_id of the last entry - if len(nonsil_left) > 0 and len(info) > 0 and utt == info[-1][0]: - # pron_ext is a set of extended pron candidates. + if len(nonsil_left) > 0 and len(info) > 0 and utt == info[-1][0]: + # pron_ext is a set of extended pron candidates. pron_ext = set() # info[-1][2] is the set of pron candidates of the last entry. for pron in info[-1][2]: @@ -211,7 +211,7 @@ def GetStatsFromCtmProns(silphones, optional_silence, non_scored_words, ctm_pron stats[(word, phones)] = stats.get((word, phones), 0) + count return stats -def WriteStats(stats, file_handle): +def WriteStats(stats, file_handle): for word_pron, count in stats.items(): print('{0} {1} {2}'.format(count, word_pron[0], word_pron[1]), file=file_handle) file_handle.close() @@ -222,7 +222,7 @@ def Main(): non_scored_words = ReadEntries(args.non_scored_words_file_handle) optional_silence = ReadEntries(args.optional_silence_file_handle) stats = GetStatsFromCtmProns(silphones, optional_silence.pop(), non_scored_words, args.ctm_prons_file_handle) - WriteStats(stats, args.stats_file_handle) + WriteStats(stats, args.stats_file_handle) if __name__ == "__main__": Main() diff --git a/egs/wsj/s5/steps/cleanup/internal/make_one_biased_lm.py b/egs/wsj/s5/steps/cleanup/internal/make_one_biased_lm.py index e41a67705e9..68055729fd9 100755 --- a/egs/wsj/s5/steps/cleanup/internal/make_one_biased_lm.py +++ b/egs/wsj/s5/steps/cleanup/internal/make_one_biased_lm.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # Copyright 2016 Johns Hopkins University (Author: Daniel Povey) # Apache 2.0. @@ -142,16 +142,18 @@ def CompletelyDiscountLowCountStates(self, min_count): hist_to_total_count = self.GetHistToTotalCount() for n in reversed(list(range(2, self.ngram_order))): this_order_counts = self.counts[n] + to_delete = [] for hist in this_order_counts.keys(): if hist_to_total_count[hist] < min_count: # we need to completely back off this count. word_to_count = this_order_counts[hist] - del this_order_counts[hist] # delete the key from the dict. + # mark this key for deleting + to_delete.append(hist) backoff_hist = hist[1:] # this will be a tuple not a list. for word, count in word_to_count.items(): self.AddCount(backoff_hist, word, count) - - + for hist in to_delete: + del this_order_counts[hist] # This backs off the counts according to Kneser-Ney (unmodified, # with interpolation). @@ -200,7 +202,7 @@ def AddTopWords(self, top_words_file): word_to_count = self.counts[0][empty_history] total = sum(word_to_count.values()) try: - f = open(top_words_file) + f = open(top_words_file, mode='r', encoding='utf-8') except: sys.exit("make_one_biased_lm.py: error opening top-words file: " "--top-words=" + top_words_file) diff --git a/egs/wsj/s5/steps/cleanup/internal/modify_ctm_edits.py b/egs/wsj/s5/steps/cleanup/internal/modify_ctm_edits.py index d6f0d0f6b23..af63ca27d2b 100755 --- a/egs/wsj/s5/steps/cleanup/internal/modify_ctm_edits.py +++ b/egs/wsj/s5/steps/cleanup/internal/modify_ctm_edits.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # Copyright 2016 Vimal Manohar # 2016 Johns Hopkins University (author: Daniel Povey) @@ -105,7 +105,7 @@ def ReadNonScoredWords(non_scored_words_file): global non_scored_words try: - f = open(non_scored_words_file) + f = open(non_scored_words_file, encoding='utf-8') except: sys.exit("modify_ctm_edits.py: error opening file: " "--non-scored-words=" + non_scored_words_file) @@ -317,12 +317,12 @@ def ProcessUtterance(split_lines_of_utt): def ProcessData(): try: - f_in = open(args.ctm_edits_in) + f_in = open(args.ctm_edits_in, encoding='utf-8') except: sys.exit("modify_ctm_edits.py: error opening ctm-edits input " "file {0}".format(args.ctm_edits_in)) try: - f_out = open(args.ctm_edits_out, 'w') + f_out = open(args.ctm_edits_out, 'w', encoding='utf-8') except: sys.exit("modify_ctm_edits.py: error opening ctm-edits output " "file {0}".format(args.ctm_edits_out)) diff --git a/egs/wsj/s5/steps/cleanup/internal/segment_ctm_edits.py b/egs/wsj/s5/steps/cleanup/internal/segment_ctm_edits.py index 39d6cb6ed80..e571fefb84c 100755 --- a/egs/wsj/s5/steps/cleanup/internal/segment_ctm_edits.py +++ b/egs/wsj/s5/steps/cleanup/internal/segment_ctm_edits.py @@ -1,4 +1,5 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 + # Copyright 2016 Vimal Manohar # 2016 Johns Hopkins University (author: Daniel Povey) @@ -894,7 +895,7 @@ def AccWordStatsForUtterance(split_lines_of_utt, def PrintWordStats(word_stats_out): try: - f = open(word_stats_out, 'w') + f = open(word_stats_out, 'w', encoding='utf-8') except: sys.exit("segment_ctm_edits.py: error opening word-stats file --word-stats-out={0} " "for writing".format(word_stats_out)) @@ -924,23 +925,23 @@ def PrintWordStats(word_stats_out): def ProcessData(): try: - f_in = open(args.ctm_edits_in) + f_in = open(args.ctm_edits_in, encoding='utf-8') except: sys.exit("segment_ctm_edits.py: error opening ctm-edits input " "file {0}".format(args.ctm_edits_in)) try: - text_output_handle = open(args.text_out, 'w') + text_output_handle = open(args.text_out, 'w', encoding='utf-8') except: sys.exit("segment_ctm_edits.py: error opening text output " "file {0}".format(args.text_out)) try: - segments_output_handle = open(args.segments_out, 'w') + segments_output_handle = open(args.segments_out, 'w', encoding='utf-8') except: sys.exit("segment_ctm_edits.py: error opening segments output " "file {0}".format(args.text_out)) if args.ctm_edits_out != None: try: - ctm_edits_output_handle = open(args.ctm_edits_out, 'w') + ctm_edits_output_handle = open(args.ctm_edits_out, 'w', encoding='utf-8') except: sys.exit("segment_ctm_edits.py: error opening ctm-edits output " "file {0}".format(args.ctm_edits_out)) @@ -994,7 +995,7 @@ def ProcessData(): def ReadNonScoredWords(non_scored_words_file): global non_scored_words try: - f = open(non_scored_words_file) + f = open(non_scored_words_file, encoding='utf-8') except: sys.exit("segment_ctm_edits.py: error opening file: " "--non-scored-words=" + non_scored_words_file) @@ -1015,7 +1016,7 @@ def ReadNonScoredWords(non_scored_words_file): oov_symbol = None if args.oov_symbol_file != None: try: - with open(args.oov_symbol_file) as f: + with open(args.oov_symbol_file, encoding='utf-8') as f: line = f.readline() assert len(line.split()) == 1 oov_symbol = line.split()[0] diff --git a/egs/wsj/s5/steps/cleanup/make_biased_lms.py b/egs/wsj/s5/steps/cleanup/make_biased_lms.py index ab508eedc9c..4b1fd320221 100755 --- a/egs/wsj/s5/steps/cleanup/make_biased_lms.py +++ b/egs/wsj/s5/steps/cleanup/make_biased_lms.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 from __future__ import print_function import sys @@ -55,7 +55,7 @@ def ProcessGroupOfLines(group_of_lines): try: command = "steps/cleanup/internal/make_one_biased_lm.py " + args.lm_opts p = subprocess.Popen(command, shell = True, stdin = subprocess.PIPE, - stdout = sys.stdout, stderr = sys.stderr) + stdout = sys.stdout, stderr = sys.stderr) for line in group_of_lines: a = line.split() if len(a) == 0: @@ -63,13 +63,15 @@ def ProcessGroupOfLines(group_of_lines): utterance_id = a[0] # print to utterance-map file print(utterance_id, group_utterance_id, file = utterance_map_file) - rest_of_line = ' '.join(a[1:]) # get rid of utterance id. - print(rest_of_line, file=p.stdin) + rest_of_line = ' '.join(a[1:]) + '\n' # get rid of utterance id. + p.stdin.write(rest_of_line.encode('utf-8')) p.stdin.close() assert p.wait() == 0 - except Exception as e: - sys.exit("make_biased_lms.py: error calling subprocess, command was: " + - command + ", error was : " + str(e)) + except Exception: + sys.stderr.write( + "make_biased_lms.py: error calling subprocess, command was: " + + command) + raise # Print a blank line; this terminates the FST in the Kaldi fst-archive # format. print("") diff --git a/egs/wsj/s5/utils/perturb_data_dir_speed.sh b/egs/wsj/s5/utils/perturb_data_dir_speed.sh index a50cdb04be4..99c9cbdb1f0 100755 --- a/egs/wsj/s5/utils/perturb_data_dir_speed.sh +++ b/egs/wsj/s5/utils/perturb_data_dir_speed.sh @@ -102,6 +102,9 @@ fi if [ -f $srcdir/spk2gender ]; then utils/apply_map.pl -f 1 $destdir/spk_map <$srcdir/spk2gender >$destdir/spk2gender fi +if [ -f $srcdir/utt2lang ]; then + utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/utt2lang >$destdir/utt2lang +fi #prepare speed-perturbed utt2dur if [ ! -f $srcdir/utt2dur ]; then