diff --git a/egs/iam/v1/local/chain/compare_wer.sh b/egs/iam/v1/local/chain/compare_wer.sh
index 4eb665fc702..ad90710b13f 100755
--- a/egs/iam/v1/local/chain/compare_wer.sh
+++ b/egs/iam/v1/local/chain/compare_wer.sh
@@ -11,6 +11,7 @@ if [ $# == 0 ]; then
   echo "e.g.: $0 exp/chain/cnn{1a,1b}"
   exit 1
 fi
+. ./path.sh

 echo "# $0 $*"
 used_epochs=false
@@ -26,6 +27,13 @@ for x in $*; do
 done
 echo

+echo -n "# CER "
+for x in $*; do
+  cer=$(cat $x/decode_test/scoring_kaldi/best_cer | awk '{print $2}')
+  printf "% 10s" $cer
+done
+echo
+
 if $used_epochs; then
   exit 0;  # the diagnostics aren't comparable between regular and discriminatively trained systems.
 fi
@@ -57,3 +65,10 @@ for x in $*; do
   printf "% 10s" $prob
 done
 echo
+
+echo -n "# Parameters "
+for x in $*; do
+  params=$(nnet3-info $x/final.mdl 2>/dev/null | grep num-parameters | cut -d' ' -f2 | awk '{printf "%0.2fM\n",$1/1000000}')
+  printf "% 10s" $params
+done
+echo
diff --git a/egs/iam/v1/local/chain/run_cnn_1a.sh b/egs/iam/v1/local/chain/run_cnn_1a.sh
index 3b1571091c1..05cb9948bd9 100755
--- a/egs/iam/v1/local/chain/run_cnn_1a.sh
+++ b/egs/iam/v1/local/chain/run_cnn_1a.sh
@@ -7,9 +7,15 @@
 # steps/info/chain_dir_info.pl exp/chain/cnn_1a/
 # exp/chain/cnn_1a/: num-iters=21 nj=2..4 num-params=4.4M dim=40->364 combine=-0.021->-0.015 xent:train/valid[13,20,final]=(-1.05,-0.701,-0.591/-1.30,-1.08,-1.00) logprob:train/valid[13,20,final]=(-0.061,-0.034,-0.030/-0.107,-0.101,-0.098)
-# cat exp/chain/cnn_1a/decode_test/scoring_kaldi/best_*
-# %WER 5.94 [ 3913 / 65921, 645 ins, 1466 del, 1802 sub ] exp/chain/cnn_1a/decode_test//cer_11_0.0
-# %WER 9.13 [ 1692 / 18542, 162 ins, 487 del, 1043 sub ] exp/chain/cnn_1a/decode_test/wer_11_0.0
+# local/chain/compare_wer.sh exp/chain/cnn_1a/
+# System                      cnn_1a
+# WER                          18.58
+# CER                          10.17
+# Final train prob           -0.0122
+# Final valid prob           -0.0999
+# Final train prob (xent)    -0.5652
+# Final valid prob (xent)    -0.9758
+# Parameters                   4.36M

 set -e -o pipefail

 stage=0
@@ -40,7 +46,7 @@ tdnn_dim=450

 # training options
 srand=0
 remove_egs=false
-lang_test=lang_test
+lang_test=lang_unk
 # End configuration section.
 echo "$0 $@"  # Print the command line for logging
diff --git a/egs/iam/v1/local/chain/run_cnn_chainali_1b.sh b/egs/iam/v1/local/chain/run_cnn_chainali_1b.sh
index ddf596a6126..d6d0ee780f4 100755
--- a/egs/iam/v1/local/chain/run_cnn_chainali_1b.sh
+++ b/egs/iam/v1/local/chain/run_cnn_chainali_1b.sh
@@ -1,20 +1,20 @@
 #!/bin/bash

 # chainali_1b is as chainali_1a except it has 3 more cnn layers and 1 less tdnn layer.
-# ./local/chain/compare_wer.sh exp/chain/cnn_chainali_1a/ exp/chain/cnn_chainali_1b/
-# System                     cnn_chainali_1a  cnn_chainali_1b
-# WER                                   6.69             6.25
-# Final train prob                   -0.0132          -0.0041
-# Final valid prob                   -0.0509          -0.0337
-# Final train prob (xent)            -0.6393          -0.6287
-# Final valid prob (xent)            -1.0116          -0.9064
+
+# local/chain/compare_wer.sh exp/chain/cnn_1a/ exp/chain/cnn_chainali_1b/
+# System                      cnn_1a  cnn_chainali_1b
+# WER                          18.58            14.67
+# CER                          10.17             7.31
+# Final train prob           -0.0122           0.0042
+# Final valid prob           -0.0999          -0.0256
+# Final train prob (xent)    -0.5652          -0.6282
+# Final valid prob (xent)    -0.9758          -0.9096
+# Parameters                   4.36M            3.96M

 # steps/info/chain_dir_info.pl exp/chain/chainali_cnn_1b/
 # exp/chain/chainali_cnn_1b/: num-iters=21 nj=2..4 num-params=4.0M dim=40->364 combine=-0.009->-0.005 xent:train/valid[13,20,final]=(-1.47,-0.728,-0.623/-1.69,-1.02,-0.940) logprob:train/valid[13,20,final]=(-0.068,-0.030,-0.011/-0.086,-0.056,-0.038)
-# cat exp/chain/cnn_chainali_1b/decode_test/scoring_kaldi/best_*
-# %WER 3.94 [ 2600 / 65921, 415 ins, 1285 del, 900 sub ] exp/chain/cnn_chainali_1b/decode_test/cer_10_0.0
-# %WER 6.25 [ 1158 / 18542, 103 ins, 469 del, 586 sub ] exp/chain/cnn_chainali_1b/decode_test/wer_12_0.0

 set -e -o pipefail

 stage=0
@@ -46,7 +46,7 @@ tdnn_dim=450

 # training options
 srand=0
 remove_egs=false
-lang_test=lang_test
+lang_test=lang_unk
 # End configuration section.
 echo "$0 $@"  # Print the command line for logging
diff --git a/egs/iam/v1/local/prepare_dict.sh b/egs/iam/v1/local/prepare_dict.sh
index 0c3bb325023..8b981de3abd 100755
--- a/egs/iam/v1/local/prepare_dict.sh
+++ b/egs/iam/v1/local/prepare_dict.sh
@@ -15,29 +15,27 @@
 cat data/train/text | \
   perl -ne '@A = split; shift @A; for(@A) {print join("\n", split(//)), "\n";}' | \
   sort -u > $dir/nonsilence_phones.txt

-# Now list all the unique words (that use only the above letters)
-# in data/train/text and LOB+Brown corpora with their comprising
-# letters as their transcription. (Letter # is replaced with <HASH>)
+# Now use pocolm's wordlist, which holds the N most frequent words from
+# data/train/text and the LOB+Brown corpora (dev and test excluded), with their
+# comprising letters as their transcription. Only include words that use the above letters.
+# (Letter # is replaced with <HASH>)

 export letters=$(cat $dir/nonsilence_phones.txt | tr -d "\n")

-cut -d' ' -f2- data/train/text | \
-  cat data/local/lobcorpus/0167/download/LOB_COCOA/lob.txt \
-      data/local/browncorpus/brown.txt - | \
+cat data/local/local_lm/data/wordlist | \
 perl -e '$letters=$ENV{letters};
-while(<>){ @A = split;
-  foreach(@A) {
-    if(! $seen{$_} && $_ =~ m/^[$letters]+$/){
-      $seen{$_} = 1;
-      $trans = join(" ", split(//));
+while(<>){
+  chop;
+  $w = $_;
+  if($w =~ m/^[$letters]+$/){
+    $trans = join(" ", split(//, $w));
       $trans =~ s/#/<HASH>/g;
-      print "$_ $trans\n";
+      print "$w $trans\n";
     }
-  }
-}' | sort > $dir/lexicon.txt
+}' | sort -u > $dir/lexicon.txt

-sed -i '' "s/#/<HASH>/" $dir/nonsilence_phones.txt
+sed -i "s/#/<HASH>/" $dir/nonsilence_phones.txt

 echo '<sil> SIL' >> $dir/lexicon.txt
 echo '<unk> SIL' >> $dir/lexicon.txt
diff --git a/egs/iam/v1/local/remove_test_utterances_from_lob.py b/egs/iam/v1/local/remove_test_utterances_from_lob.py
new file mode 100755
index 00000000000..1b414ef47f6
--- /dev/null
+++ b/egs/iam/v1/local/remove_test_utterances_from_lob.py
@@ -0,0 +1,117 @@
+#!/usr/bin/env python3
+# Copyright  2018  Ashish Arora
+
+import argparse
+import os
+import numpy as np
+import sys
+import re
+
+parser = argparse.ArgumentParser(description="""Removes dev/test set lines
+                                 from the LOB corpus. Reads the
+                                 corpus from stdin, and writes it to stdout.""")
+parser.add_argument('dev_text', type=str,
+                    help='dev transcription location.')
+parser.add_argument('test_text', type=str,
+                    help='test transcription location.')
+args = parser.parse_args()
+
+def remove_punctuations(transcript):
+    char_list = []
+    for char in transcript:
+        if char.isdigit() or char == '+' or char == '~' or char == '?':
+            continue
+        if char == '#' or char == '=' or char == '-' or char == '!':
+            continue
+        if char == ',' or char == '.' or char == ')' or char == '\'':
+            continue
+        if char == '(' or char == ':' or char == ';' or char == '"':
+            continue
+        char_list.append(char)
+    return char_list
+
+
+def remove_special_words(words):
+    word_list = []
+    for word in words:
+        if word == '' or word == '#':
+            continue
+        word_list.append(word)
+    return word_list
+
+
+# process dev/eval transcripts and add them to utterance_dict:
+# remove special words and punctuation, drop spaces between words,
+# and lowercase the characters
+def read_utterances(text_file_path):
+    with open(text_file_path, 'rt') as in_file:
+        for line in in_file:
+            words = line.strip().split()
+            words_wo_sw = remove_special_words(words)
+            transcript = ''.join(words_wo_sw[1:])
+            transcript = transcript.lower()
+            trans_wo_punct = remove_punctuations(transcript)
+            transcript = ''.join(trans_wo_punct)
+            utterance_dict[words_wo_sw[0]] = transcript
+
+
+### main ###
+
+# read utterances and add them to utterance_dict
+utterance_dict = dict()
+read_utterances(args.dev_text)
+read_utterances(args.test_text)
+
+# read the corpus and add it to the lists below
+corpus_text_lowercase_wo_sc = list()
+corpus_text_wo_sc = list()
+original_corpus_text = list()
+for line in sys.stdin:
+    original_corpus_text.append(line)
+    words = line.strip().split()
+    words_wo_sw = remove_special_words(words)
+
+    transcript = ''.join(words_wo_sw)
+    transcript = transcript.lower()
+    trans_wo_punct = remove_punctuations(transcript)
+    transcript = ''.join(trans_wo_punct)
+    corpus_text_lowercase_wo_sc.append(transcript)
+
+    transcript = ''.join(words_wo_sw)
+    trans_wo_punct = remove_punctuations(transcript)
+    transcript = ''.join(trans_wo_punct)
+    corpus_text_wo_sc.append(transcript)
+
+# try to find each dev/test utterance in the corpus below;
+# utterances which were not found are added to
+# remaining_utterances
+row_to_keep = [True for i in range(len(original_corpus_text))]
+remaining_utterances = dict()
+for line_id, line_to_find in utterance_dict.items():
+    found_line = False
+    for i in range(1, (len(corpus_text_lowercase_wo_sc) - 2)):
+        # Combine 3 consecutive lines of the corpus into a single line
+        prev_words = corpus_text_lowercase_wo_sc[i - 1].strip()
+        curr_words = corpus_text_lowercase_wo_sc[i].strip()
+        next_words = corpus_text_lowercase_wo_sc[i + 1].strip()
+        new_line = prev_words + curr_words + next_words
+        transcript = ''.join(new_line)
+        if line_to_find in transcript:
+            found_line = True
+            row_to_keep[i-1] = False
+            row_to_keep[i] = False
+            row_to_keep[i+1] = False
+    if not found_line:
+        remaining_utterances[line_id] = line_to_find
+
+
+for i in range(len(original_corpus_text)):
+    transcript = original_corpus_text[i].strip()
+    if row_to_keep[i]:
+        print(transcript)
+
+print('Sentences not removed from LOB: {}'.format(remaining_utterances), file=sys.stderr)
+print('Total test+dev sentences: {}'.format(len(utterance_dict)), file=sys.stderr)
+print('Number of sentences not removed from LOB: {}'.
+      format(len(remaining_utterances)), file=sys.stderr)
+print('LOB lines: Before: {} After: {}'.format(len(original_corpus_text),
+      row_to_keep.count(True)), file=sys.stderr)
diff --git a/egs/iam/v1/local/score.sh b/egs/iam/v1/local/score.sh
index 31564d25326..d964d70206b 100755
--- a/egs/iam/v1/local/score.sh
+++ b/egs/iam/v1/local/score.sh
@@ -1,5 +1,157 @@
 #!/bin/bash
+# Copyright 2012-2014  Johns Hopkins University (Author: Daniel Povey, Yenda Trmal)
+# Apache 2.0
+# This script is like steps/scoring/score_kaldi_wer.sh except that it transcribes
+# the <unk>'s using local/unk_arc_post_to_transcription.py and also calls
+# steps/scoring/score_kaldi_cer.sh at the end.

-steps/scoring/score_kaldi_wer.sh "$@"
-steps/scoring/score_kaldi_cer.sh --stage 2 "$@"
+[ -f ./path.sh ] && . ./path.sh
+
+# begin configuration section.
+cmd=run.pl
+stage=0
+decode_mbr=false
+stats=true
+beam=6
+word_ins_penalty=0.0,0.5,1.0
+min_lmwt=3
+max_lmwt=13
+iter=final
+#end configuration section.
+
+echo "$0 $@"  # Print the command line for logging
+[ -f ./path.sh ] && . ./path.sh
+. parse_options.sh || exit 1;
+
+if [ $# -ne 3 ]; then
+  echo "Usage: $0 [--cmd (run.pl|queue.pl...)] <data-dir> <lang-dir|graph-dir> <decode-dir>"
+  echo " Options:"
+  echo "    --cmd (run.pl|queue.pl...)      # specify how to run the sub-processes."
+  echo "    --stage (0|1|2)                 # start scoring script from part-way through."
+  echo "    --decode_mbr (true/false)       # minimum Bayes risk decoding (confusion network)."
+  echo "    --min_lmwt <int>                # minimum LM-weight for lattice rescoring "
+  echo "    --max_lmwt <int>                # maximum LM-weight for lattice rescoring "
+  exit 1;
+fi
+
+data=$1
+lang_or_graph=$2
+dir=$3
+model_path=`echo $dir |xargs dirname`
+symtab=$lang_or_graph/words.txt
+
+for f in $symtab $dir/lat.1.gz $data/text; do
+  [ ! -f $f ] && echo "score.sh: no such file $f" && exit 1;
+done
+
+
+ref_filtering_cmd="cat"
+[ -x local/wer_output_filter ] && ref_filtering_cmd="local/wer_output_filter"
+[ -x local/wer_ref_filter ] && ref_filtering_cmd="local/wer_ref_filter"
+hyp_filtering_cmd="cat"
+[ -x local/wer_output_filter ] && hyp_filtering_cmd="local/wer_output_filter"
+[ -x local/wer_hyp_filter ] && hyp_filtering_cmd="local/wer_hyp_filter"
+
+
+if $decode_mbr ; then
+  echo "$0: scoring with MBR, word insertion penalty=$word_ins_penalty"
+else
+  echo "$0: scoring with word insertion penalty=$word_ins_penalty"
+fi
+
+
+mkdir -p $dir/scoring_kaldi
+cat $data/text | $ref_filtering_cmd > $dir/scoring_kaldi/test_filt.txt || exit 1;
+if [ $stage -le 0 ]; then
+
+  for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do
+    mkdir -p $dir/scoring_kaldi/penalty_$wip/log
+
+    if $decode_mbr ; then
+      $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring_kaldi/penalty_$wip/log/best_path.LMWT.log \
+        acwt=\`perl -e \"print 1.0/LMWT\"\`\; \
+        lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \
+        lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \
+        lattice-prune --beam=$beam ark:- ark:- \| \
+        lattice-mbr-decode --word-symbol-table=$symtab \
+        ark:- ark,t:- \| \
+        utils/int2sym.pl -f 2- $symtab \| \
+        $hyp_filtering_cmd '>' $dir/scoring_kaldi/penalty_$wip/LMWT.txt || exit 1;
+
+    else
+      $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring_kaldi/penalty_$wip/log/best_path.LMWT.log \
+        lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \
+        lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \
+        lattice-1best ark:- ark:- \| \
+        lattice-align-words $lang_or_graph/phones/word_boundary.int $model_path/final.mdl ark:- ark:- \| \
+        lattice-arc-post $model_path/final.mdl ark:- - \| \
+        local/unk_arc_post_to_transcription.py $lang_or_graph/phones.txt $lang_or_graph/words.txt data/lang_unk/oov.int \| \
+        $hyp_filtering_cmd '>' $dir/scoring_kaldi/penalty_$wip/LMWT.txt || exit 1;
+    fi
+
+    $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring_kaldi/penalty_$wip/log/score.LMWT.log \
+      cat $dir/scoring_kaldi/penalty_$wip/LMWT.txt \| \
+      compute-wer --text --mode=present \
+      "ark:cat $dir/scoring_kaldi/test_filt.txt |" ark,p:- ">&" $dir/wer_LMWT_$wip || exit 1;
+
+  done
+fi
+
+
+
+if [ $stage -le 1 ]; then
+
+  for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do
+    for lmwt in $(seq $min_lmwt $max_lmwt); do
+      # adding /dev/null to the command list below forces grep to output the filename
+      grep WER $dir/wer_${lmwt}_${wip} /dev/null
+    done
+  done | utils/best_wer.sh >& $dir/scoring_kaldi/best_wer || exit 1
+
+  best_wer_file=$(awk '{print $NF}' $dir/scoring_kaldi/best_wer)
+  best_wip=$(echo $best_wer_file | awk -F_ '{print $NF}')
+  best_lmwt=$(echo $best_wer_file | awk -F_ '{N=NF-1; print $N}')
+
+  if [ -z "$best_lmwt" ]; then
+    echo "$0: we could not get the details of the best WER from the file $dir/wer_*. Probably something went wrong."
+    exit 1;
+  fi
+
+  if $stats; then
+    mkdir -p $dir/scoring_kaldi/wer_details
+    echo $best_lmwt > $dir/scoring_kaldi/wer_details/lmwt # record best language model weight
+    echo $best_wip > $dir/scoring_kaldi/wer_details/wip # record best word insertion penalty
+
+    $cmd $dir/scoring_kaldi/log/stats1.log \
+      cat $dir/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt \| \
+      align-text --special-symbol="'***'" ark:$dir/scoring_kaldi/test_filt.txt ark:- ark,t:- \| \
+      utils/scoring/wer_per_utt_details.pl --special-symbol "'***'" \| tee $dir/scoring_kaldi/wer_details/per_utt \|\
+      utils/scoring/wer_per_spk_details.pl $data/utt2spk \> $dir/scoring_kaldi/wer_details/per_spk || exit 1;
+
+    $cmd $dir/scoring_kaldi/log/stats2.log \
+      cat $dir/scoring_kaldi/wer_details/per_utt \| \
+      utils/scoring/wer_ops_details.pl --special-symbol "'***'" \| \
+      sort -b -i -k 1,1 -k 4,4rn -k 2,2 -k 3,3 \> $dir/scoring_kaldi/wer_details/ops || exit 1;
+
+    $cmd $dir/scoring_kaldi/log/wer_bootci.log \
+      compute-wer-bootci --mode=present \
+      ark:$dir/scoring_kaldi/test_filt.txt ark:$dir/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt \
+      '>' $dir/scoring_kaldi/wer_details/wer_bootci || exit 1;
+
+  fi
+fi
+
+steps/scoring/score_kaldi_cer.sh --cmd "$cmd" --stage 2 --min-lmwt $min_lmwt \
+  --max-lmwt $max_lmwt --word-ins-penalty $word_ins_penalty \
+  $data $lang_or_graph $dir
+
+# If we got here, the scoring was successful.
+# As a small aid to prevent confusion, we remove all wer_{?,??} and cer_{?,??} files;
+# these originate from the previous version of the scoring scripts and could
+# otherwise cause confusion about what this script produces.
+rm $dir/wer_{?,??} 2>/dev/null
+rm $dir/cer_{?,??} 2>/dev/null
+
+exit 0;
diff --git a/egs/iam/v1/local/train_lm.sh b/egs/iam/v1/local/train_lm.sh
index aa4303d6a28..a673c5b3f2d 100755
--- a/egs/iam/v1/local/train_lm.sh
+++ b/egs/iam/v1/local/train_lm.sh
@@ -13,6 +13,7 @@
 set -e
 stage=0
+vocab_size=50000

 echo "$0 $@"  # Print the command line for logging
 . ./utils/parse_options.sh || exit 1;
@@ -57,8 +58,10 @@ if [ $stage -le 0 ]; then
   rm ${dir}/data/text/* 2>/dev/null || true

   # Using LOB and brown corpus.
-  cat data/local/lobcorpus/0167/download/LOB_COCOA/lob.txt > ${dir}/data/text/text.txt
-  cat data/local/browncorpus/brown.txt >> ${dir}/data/text/text.txt
+  cat data/local/lobcorpus/0167/download/LOB_COCOA/lob.txt | \
+    local/remove_test_utterances_from_lob.py data/test/text data/val/text \
+    > ${dir}/data/text/lob.txt
+  cat data/local/browncorpus/brown.txt >> ${dir}/data/text/brown.txt

   # use the validation data as the dev set.
   # Note: the name 'dev' is treated specially by pocolm, it automatically
@@ -78,8 +81,8 @@ if [ $stage -le 0 ]; then
   cut -d " " -f 2- < data/test/text > ${dir}/data/real_dev_set.txt

   # get the wordlist from IAM text
-  cat ${dir}/data/text/{iam,text}.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count
-  cat ${dir}/data/word_count | awk '{print $2}' > ${dir}/data/wordlist
+  cat ${dir}/data/text/{iam,lob,brown}.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count
+  head -n $vocab_size ${dir}/data/word_count | awk '{print $2}' > ${dir}/data/wordlist
 fi

 order=3
@@ -91,7 +94,7 @@ if [ $stage -le 1 ]; then
   # Note: if you have more than one order, use a certain amount of words as the
   # vocab and want to restrict max memory for 'sort',
   echo "$0: training the unpruned LM"
-  min_counts='train=2 iam=1'
+  min_counts='brown=2 lob=2 iam=1'
   wordlist=${dir}/data/wordlist

   lm_name="`basename ${wordlist}`_${order}"
diff --git a/egs/iam/v1/local/unk_arc_post_to_transcription.py b/egs/iam/v1/local/unk_arc_post_to_transcription.py
new file mode 100755
index 00000000000..c86d35e4b8a
--- /dev/null
+++ b/egs/iam/v1/local/unk_arc_post_to_transcription.py
@@ -0,0 +1,86 @@
+#!/usr/bin/env python
+
+# Copyright  2017  Ashish Arora
+
+import argparse
+import sys
+
+parser = argparse.ArgumentParser(description="""uses phones to convert unk to word""")
+parser.add_argument('phones', type=str, help='phones and phonesID')
+parser.add_argument('words', type=str, help='word and wordID')
+parser.add_argument('unk', type=str, default='-', help='location of unk file')
+parser.add_argument('--input-ark', type=str, default='-', help='where to read the input data')
+parser.add_argument('--out-ark', type=str, default='-', help='where to write the output data')
+args = parser.parse_args()
+### main ###
+phone_fh = open(args.phones, 'r')
+word_fh = open(args.words, 'r')
+unk_fh = open(args.unk, 'r')
+if args.input_ark == '-':
+    input_fh = sys.stdin
+else:
+    input_fh = open(args.input_ark, 'r')
+if args.out_ark == '-':
+    out_fh = sys.stdout
+else:
+    out_fh = open(args.out_ark, 'wb')
+
+phone_dict = dict()  # stores phoneID and phone mapping
+phone_data_vect = phone_fh.read().strip().split("\n")
+for key_val in phone_data_vect:
+    key_val = key_val.split(" ")
+    phone_dict[key_val[1]] = key_val[0]
+word_dict = dict()
+word_data_vect = word_fh.read().strip().split("\n")
+for key_val in word_data_vect:
+    key_val = key_val.split(" ")
+    word_dict[key_val[1]] = key_val[0]
+unk_val = unk_fh.read().strip().split(" ")[0]
+
+utt_word_dict = dict()
+utt_phone_dict = dict()  # stores utteranceID and phoneID
+unk_word_dict = dict()
+count = 0
+for line in input_fh:
+    line_vect = line.strip().split("\t")
+    if len(line_vect) < 6:
+        print "IndexError"
+        print line_vect
+        continue
+    uttID = line_vect[0]
+    word = line_vect[4]
+    phones = line_vect[5]
+    if uttID in utt_word_dict.keys():
+        utt_word_dict[uttID][count] = word
+        utt_phone_dict[uttID][count] = phones
+    else:
+        count = 0
+        utt_word_dict[uttID] = dict()
+        utt_phone_dict[uttID] = dict()
+        utt_word_dict[uttID][count] = word
+        utt_phone_dict[uttID][count] = phones
+    if word == unk_val:  # get character sequence for unk
+        phone_key_vect = phones.split(" ")
+        phone_val_vect = list()
+        for pkey in phone_key_vect:
+            phone_val_vect.append(phone_dict[pkey])
+        phone_2_word = list()
+        for phone_val in phone_val_vect:
+            phone_2_word.append(phone_val.split('_')[0])
+        phone_2_word = ''.join(phone_2_word)
+        utt_word_dict[uttID][count] = phone_2_word
+    else:
+        if word == '0':
+            word_val = ' '
+        else:
+            word_val = word_dict[word]
+        utt_word_dict[uttID][count] = word_val
+    count += 1
+
+transcription = ""
+for key in sorted(utt_word_dict.iterkeys()):
+    transcription = key
+    for index in sorted(utt_word_dict[key].iterkeys()):
+        value = utt_word_dict[key][index]
+        transcription = transcription + " " + value
+    out_fh.write(transcription + '\n')
diff --git a/egs/iam/v1/run.sh b/egs/iam/v1/run.sh
index d5f66ca4110..f5c4a2b8f80 100755
--- a/egs/iam/v1/run.sh
+++ b/egs/iam/v1/run.sh
@@ -21,7 +21,6 @@ iam_database=/export/corpora5/handwriting_ocr/IAM
 . ./utils/parse_options.sh  # e.g. this parses the above options
                             # if supplied.

-
 ./local/check_tools.sh

 if [ $stage -le 0 ]; then
@@ -42,17 +41,26 @@ if [ $stage -le 1 ]; then
 fi

 if [ $stage -le 2 ]; then
+  echo "$0: Estimating a language model for decoding..."
+  # We do this stage before dict preparation because prepare_dict.sh
+  # generates the lexicon from pocolm's wordlist
+  local/train_lm.sh --vocab-size 50000
+fi
+
+if [ $stage -le 3 ]; then
   echo "$0: Preparing dictionary and lang..."
   local/prepare_dict.sh
   utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.95 \
                         data/local/dict "<unk>" data/lang/temp data/lang
-fi
-
-if [ $stage -le 3 ]; then
-  echo "$0: Estimating a language model for decoding..."
-  local/train_lm.sh
   utils/format_lm.sh data/lang data/local/local_lm/data/arpa/3gram_big.arpa.gz \
                      data/local/dict/lexicon.txt data/lang_test
+  echo "$0: Preparing the unk model for open-vocab decoding..."
+  utils/lang/make_unk_lm.sh --ngram-order 4 --num-extra-ngrams 7500 \
+                            data/local/dict exp/unk_lang_model
+  utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 \
+                        --unk-fst exp/unk_lang_model/unk_fst.txt \
+                        data/local/dict "<unk>" data/local/temp data/lang_unk
+  cp data/lang_test/G.fst data/lang_unk/G.fst
 fi

 if [ $stage -le 4 ]; then