diff --git a/egs/iam/v1/local/chain/run_cnn_1a.sh b/egs/iam/v1/local/chain/run_cnn_1a.sh
index 05cb9948bd9..e2a6988e6fe 100755
--- a/egs/iam/v1/local/chain/run_cnn_1a.sh
+++ b/egs/iam/v1/local/chain/run_cnn_1a.sh
@@ -89,7 +89,7 @@ if [ $stage -le 1 ]; then
   # topo file. [note, it really has two states.. the first one is only repeated
   # once, the second one has zero or more repeats.]
   if [ -d $lang ]; then
-    if [ $lang/L.fst -nt data/$lang_test/L.fst ]; then
+    if [ $lang/L.fst -nt data/lang/L.fst ]; then
       echo "$0: $lang already exists, not overwriting it; continuing"
     else
       echo "$0: $lang already exists and seems to be older than data/lang..."
@@ -97,7 +97,7 @@ if [ $stage -le 1 ]; then
       exit 1;
     fi
   else
-    cp -r data/$lang_test $lang
+    cp -r data/lang $lang
     silphonelist=$(cat $lang/phones/silence.csl) || exit 1;
     nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1;
     # Use our special topology... note that later on may have to tune this
@@ -110,7 +110,7 @@ if [ $stage -le 2 ]; then
   # Get the alignments as lattices (gives the chain training more freedom).
   # use the same num-jobs as the alignments
   steps/align_fmllr_lats.sh --nj $nj --cmd "$cmd" ${train_data_dir} \
-    data/$lang_test $gmm_dir $lat_dir
+    data/lang $gmm_dir $lat_dir
   rm $lat_dir/fsts.*.gz # save space
 fi
 
diff --git a/egs/iam/v1/local/chain/run_cnn_chainali_1a.sh b/egs/iam/v1/local/chain/run_cnn_chainali_1a.sh
index 2c8b6c91e5a..fc9d09a755b 100755
--- a/egs/iam/v1/local/chain/run_cnn_chainali_1a.sh
+++ b/egs/iam/v1/local/chain/run_cnn_chainali_1a.sh
@@ -2,21 +2,6 @@
 
 # chainali_1a is as 1a except it uses chain alignments (using 1a system) instead of gmm alignments
 
-# ./local/chain/compare_wer.sh exp/chain/cnn_chainali_1a/ exp/chain/cnn_1a/
-# System                      cnn_chainali_1a  cnn_1a
-# WER                                    6.69    9.13
-# Final train prob                    -0.0128 -0.0297
-# Final valid prob                    -0.0447 -0.0975
-# Final train prob (xent)             -0.6448 -0.5915
-# Final valid prob (xent)             -0.9924 -1.0022
-
-# steps/info/chain_dir_info.pl exp/chain/cnn_chainali_1a/
-# exp/chain/cnn_chainali_1a/: num-iters=21 nj=2..4 num-params=4.4M dim=40->364 combine=-0.002->0.000 xent:train/valid[13,20,final]=(-0.929,-0.711,-0.645/-1.16,-1.04,-0.992) logprob:train/valid[13,20,final]=(-0.029,-0.016,-0.013/-0.051,-0.047,-0.045)
-
-# cat exp/chain/cnn_chainali_1a/decode_test/scoring_kaldi/best_*
-# %WER 3.94 [ 2600 / 65921, 549 ins, 837 del, 1214 sub ] exp/chain/cnn_chainali_1a/decode_test/cer_15_0.0
-# %WER 6.69 [ 1241 / 18542, 135 ins, 358 del, 748 sub ] exp/chain/cnn_chainali_1a/decode_test/wer_15_0.5
-
 set -e -o pipefail
 
 stage=0
@@ -28,7 +13,7 @@ gmm=tri3  # this is the source gmm-dir that we'll use for alignments; it
 nnet3_affix=    # affix for exp dirs, e.g. it was _cleaned in tedlium.
 affix=_1a  #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration.
 ali=tri3_ali
-chain_model_dir=exp/chain${nnet3_affix}/cnn${affix}
+chain_model_dir=exp/chain${nnet3_affix}/cnn_1a
 common_egs_dir=
 reporting_email=
 
@@ -90,7 +75,7 @@ if [ $stage -le 1 ]; then
   # topo file. [note, it really has two states.. the first one is only repeated
   # once, the second one has zero or more repeats.]
   if [ -d $lang ]; then
-    if [ $lang/L.fst -nt data/$lang_test/L.fst ]; then
+    if [ $lang/L.fst -nt data/lang/L.fst ]; then
       echo "$0: $lang already exists, not overwriting it; continuing"
     else
       echo "$0: $lang already exists and seems to be older than data/lang..."
@@ -98,7 +83,7 @@ if [ $stage -le 1 ]; then
       exit 1;
     fi
   else
-    cp -r data/$lang_test $lang
+    cp -r data/lang $lang
     silphonelist=$(cat $lang/phones/silence.csl) || exit 1;
     nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1;
     # Use our special topology... note that later on may have to tune this
@@ -112,7 +97,7 @@ if [ $stage -le 2 ]; then
   # use the same num-jobs as the alignments
   steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \
     --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \
-    ${train_data_dir} data/$lang_test $chain_model_dir $lat_dir
+    ${train_data_dir} data/lang $chain_model_dir $lat_dir
   cp $gmm_lat_dir/splice_opts $lat_dir/splice_opts
 fi
 
diff --git a/egs/iam/v1/local/chain/run_cnn_chainali_1b.sh b/egs/iam/v1/local/chain/run_cnn_chainali_1b.sh
index d6d0ee780f4..389e08ad5ba 100755
--- a/egs/iam/v1/local/chain/run_cnn_chainali_1b.sh
+++ b/egs/iam/v1/local/chain/run_cnn_chainali_1b.sh
@@ -27,7 +27,7 @@ gmm=tri3  # this is the source gmm-dir that we'll use for alignments; it
 nnet3_affix=    # affix for exp dirs, e.g. it was _cleaned in tedlium.
 affix=_1b  #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration.
 ali=tri3_ali
-chain_model_dir=exp/chain${nnet3_affix}/cnn${affix}
+chain_model_dir=exp/chain${nnet3_affix}/cnn_1a
 common_egs_dir=
 reporting_email=
 
@@ -89,7 +89,7 @@ if [ $stage -le 1 ]; then
   # topo file. [note, it really has two states.. the first one is only repeated
   # once, the second one has zero or more repeats.]
   if [ -d $lang ]; then
-    if [ $lang/L.fst -nt data/$lang_test/L.fst ]; then
+    if [ $lang/L.fst -nt data/lang/L.fst ]; then
       echo "$0: $lang already exists, not overwriting it; continuing"
     else
       echo "$0: $lang already exists and seems to be older than data/lang..."
@@ -97,7 +97,7 @@ if [ $stage -le 1 ]; then
       exit 1;
     fi
   else
-    cp -r data/$lang_test $lang
+    cp -r data/lang $lang
     silphonelist=$(cat $lang/phones/silence.csl) || exit 1;
     nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1;
     # Use our special topology... note that later on may have to tune this
@@ -111,7 +111,7 @@ if [ $stage -le 2 ]; then
   # use the same num-jobs as the alignments
   steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \
     --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \
-    ${train_data_dir} data/$lang_test $chain_model_dir $lat_dir
+    ${train_data_dir} data/lang $chain_model_dir $lat_dir
   cp $gmm_lat_dir/splice_opts $lat_dir/splice_opts
 fi
 
diff --git a/egs/iam/v1/local/chain/run_cnn_chainali_1c.sh b/egs/iam/v1/local/chain/run_cnn_chainali_1c.sh
index 6ff76490303..9cc74093165 100755
--- a/egs/iam/v1/local/chain/run_cnn_chainali_1c.sh
+++ b/egs/iam/v1/local/chain/run_cnn_chainali_1c.sh
@@ -25,7 +25,7 @@ gmm=tri3  # this is the source gmm-dir that we'll use for alignments; it
 nnet3_affix=    # affix for exp dirs, e.g. it was _cleaned in tedlium.
 affix=_1c  #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration.
 ali=tri3_ali
-chain_model_dir=exp/chain${nnet3_affix}/cnn${affix}
+chain_model_dir=exp/chain${nnet3_affix}/cnn_1a
 common_egs_dir=
 reporting_email=
 
@@ -33,7 +33,6 @@ reporting_email=
 train_stage=-10
 xent_regularize=0.1
 frame_subsampling_factor=4
-alignment_subsampling_factor=1
 # training chunk-options
 chunk_width=340,300,200,100
 num_leaves=500
@@ -75,7 +74,6 @@ tree_dir=exp/chain${nnet3_affix}/tree_chain
 # you should probably name it differently.
 lang=data/lang_chain
 for f in $train_data_dir/feats.scp \
-    $train_data_dir/feats.scp $gmm_dir/final.mdl \
     $ali_dir/ali.1.gz $gmm_dir/final.mdl; do
   [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1
 done
@@ -87,7 +85,7 @@ if [ $stage -le 1 ]; then
   # topo file. [note, it really has two states.. the first one is only repeated
   # once, the second one has zero or more repeats.]
   if [ -d $lang ]; then
-    if [ $lang/L.fst -nt data/$lang_test/L.fst ]; then
+    if [ $lang/L.fst -nt data/lang/L.fst ]; then
       echo "$0: $lang already exists, not overwriting it; continuing"
     else
       echo "$0: $lang already exists and seems to be older than data/lang..."
@@ -95,7 +93,7 @@ if [ $stage -le 1 ]; then
       exit 1;
     fi
   else
-    cp -r data/$lang_test $lang
+    cp -r data/lang $lang
     silphonelist=$(cat $lang/phones/silence.csl) || exit 1;
     nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1;
     # Use our special topology... note that later on may have to tune this
@@ -109,7 +107,7 @@ if [ $stage -le 2 ]; then
   # use the same num-jobs as the alignments
   steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \
     --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \
-    ${train_data_dir} data/$lang_test $chain_model_dir $lat_dir
+    ${train_data_dir} data/lang $chain_model_dir $lat_dir
   cp $gmm_lat_dir/splice_opts $lat_dir/splice_opts
 fi
 
@@ -136,12 +134,12 @@ if [ $stage -le 4 ]; then
   num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}')
   learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python)
-  opts="l2-regularize=0.075"
-  opts_2="l2-regularize=0.075"
-  opts_3="l2-regularize=0.1"
-  common1="$opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36"
-  common2="$opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70"
-  common3="$opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=70"
+  cnn_opts="l2-regularize=0.075"
+  tdnn_opts="l2-regularize=0.075"
+  output_opts="l2-regularize=0.1"
+  common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36"
+  common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70"
+  common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=70"
 
   mkdir -p $dir/configs
   cat <<EOF > $dir/configs/network.xconfig
   input dim=40 name=input
@@ -153,13 +151,13 @@ if [ $stage -le 4 ]; then
   conv-relu-batchnorm-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2
   conv-relu-batchnorm-layer name=cnn6 height-in=10 height-out=10 time-offsets=-1,0,1 $common3
   conv-relu-batchnorm-layer name=cnn7 height-in=10 height-out=10 time-offsets=-1,0,1 $common3
-  relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim $opts_2
-  relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $opts_2
-  relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $opts_2
+  relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim $tdnn_opts
+  relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts
+  relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts
   ## adding the layers for chain branch
-  relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $opts_2
-  output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $opts_3
+  relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $tdnn_opts
+  output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts
 
   # adding the layers for xent branch
   # This block prints the configs for a separate output that will be
@@ -170,8 +168,8 @@ if [ $stage -le 4 ]; then
   # final-layer learns at a rate independent of the regularization
   # constant; and the 0.5 was tuned so as to make the relative progress
   # similar in the xent and regular final layers.
-  relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 $opts_2
-  output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $opts_3
+  relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 $tdnn_opts
+  output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $output_opts
 EOF
   steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/
 fi
@@ -192,7 +190,9 @@ if [ $stage -le 5 ]; then
     --chain.apply-deriv-weights=false \
     --chain.lm-opts="--num-extra-lm-states=500" \
     --chain.frame-subsampling-factor=$frame_subsampling_factor \
-    --chain.alignment-subsampling-factor=$alignment_subsampling_factor \
+    --chain.alignment-subsampling-factor=1 \
+    --chain.left-tolerance 3 \
+    --chain.right-tolerance 3 \
     --trainer.srand=$srand \
     --trainer.max-param-change=2.0 \
     --trainer.num-epochs=4 \
diff --git a/egs/iam/v1/local/chain/run_flatstart_cnn1a.sh b/egs/iam/v1/local/chain/run_flatstart_cnn1a.sh
index 65eeedcc75b..e22fe03899e 100755
--- a/egs/iam/v1/local/chain/run_flatstart_cnn1a.sh
+++ b/egs/iam/v1/local/chain/run_flatstart_cnn1a.sh
@@ -5,16 +5,16 @@
 
 # local/chain/compare_wer.sh exp/chain/cnn_1a exp/chain/cnn_chainali_1c exp/chain/e2e_cnn_1a
 # System                        cnn_1a  cnn_chainali_1c  e2e_cnn_1a
-# WER                            18.58            12.84       15.46
-# CER                            10.17             6.40        7.21
-# Final train prob             -0.0122          -0.0120     -0.0426
-# Final valid prob             -0.0999          -0.0199     -0.0724
+# WER                            18.58            12.84       14.06
+# CER                            10.17             6.40        6.57
+# Final train prob             -0.0122          -0.0120     -0.0346
+# Final valid prob             -0.0999          -0.0199     -0.0594
 # Final train prob (xent)      -0.5652          -0.9973
 # Final valid prob (xent)      -0.9758          -1.1537
 # Parameters                     4.36M            3.96M       9.13M
 
-# steps/info/chain_dir_info.pl exp/chain/e2e_cnn_1a/
-# exp/chain/e2e_cnn_1a/: num-iters=21 nj=2..4 num-params=9.1M dim=40->12640 combine=-0.040->-0.040 (over 1) logprob:train/valid[13,20,final]=(-0.065,-0.046,-0.043/-0.081,-0.073,-0.072)
+# steps/info/chain_dir_info.pl exp/chain/e2e_cnn_1a
+# exp/chain/e2e_cnn_1a: num-iters=21 nj=2..4 num-params=9.1M dim=40->12640 combine=-0.033->-0.033 (over 1) logprob:train/valid[13,20,final]=(-0.058,-0.042,-0.035/-0.070,-0.064,-0.059)
 
 set -e
 
@@ -34,8 +34,8 @@ common_egs_dir=
 l2_regularize=0.00005
 frames_per_iter=1000000
 cmvn_opts="--norm-means=true --norm-vars=true"
-train_set=train_e2e
-lang_test=lang_test
+train_set=train
+lang_test=lang_unk
 # End configuration section.
 
 echo "$0 $@"  # Print the command line for logging
@@ -74,19 +74,24 @@ if [ $stage -le 1 ]; then
     --shared-phones true \
     --type biphone \
     data/$train_set $lang $treedir
-  cp exp/chain/e2e_base/phone_lm.fst $treedir/
+  $cmd $treedir/log/make_phone_lm.log \
+    cat data/$train_set/text \| \
+    steps/nnet3/chain/e2e/text_to_phones.py data/lang \| \
+    utils/sym2int.pl -f 2- data/lang/phones.txt \| \
+    chain-est-phone-lm --num-extra-lm-states=500 \
+    ark:- $treedir/phone_lm.fst
 fi
 
 if [ $stage -le 2 ]; then
   echo "$0: creating neural net configs using the xconfig parser";
   num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}')
-  opts="l2-regularize=0.075"
-  opts_2="l2-regularize=0.075"
-  opts_3="l2-regularize=0.1"
-  common1="$opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36"
-  common2="$opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70"
-  common3="$opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=70"
+  cnn_opts="l2-regularize=0.075"
+  tdnn_opts="l2-regularize=0.075"
+  output_opts="l2-regularize=0.1"
+  common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36"
+  common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70"
+  common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=70"
 
   mkdir -p $dir/configs
   cat <<EOF > $dir/configs/network.xconfig
   input dim=40 name=input
@@ -98,13 +103,13 @@ if [ $stage -le 2 ]; then
   conv-relu-batchnorm-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2
   conv-relu-batchnorm-layer name=cnn6 height-in=10 height-out=10 time-offsets=-1,0,1 $common3
   conv-relu-batchnorm-layer name=cnn7 height-in=10 height-out=10 time-offsets=-1,0,1 $common3
-  relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim $opts_2
-  relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $opts_2
-  relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $opts_2
+  relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim $tdnn_opts
+  relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts
+  relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts
   ## adding the layers for chain branch
-  relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $opts_2
-  output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $opts_3
+  relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $output_opts
+  output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts
 EOF
 
   steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs
diff --git a/egs/iam/v1/local/make_features.py b/egs/iam/v1/local/make_features.py
index 8cfca5ee830..84e012daedb 100755
--- a/egs/iam/v1/local/make_features.py
+++ b/egs/iam/v1/local/make_features.py
@@ -124,4 +124,4 @@ def get_scaled_image(im, allowed_lengths = None):
     write_kaldi_matrix(out_fh, data, image_id)
 
 print('Generated features for {} images. Failed for {} (iamge too '
-      'long).'.format(num_ok, num_fail))
+      'long).'.format(num_ok, num_fail), file=sys.stderr)
diff --git a/egs/iam/v1/local/prepare_dict.sh b/egs/iam/v1/local/prepare_dict.sh
index 8b981de3abd..f691d577fba 100755
--- a/egs/iam/v1/local/prepare_dict.sh
+++ b/egs/iam/v1/local/prepare_dict.sh
@@ -8,6 +8,9 @@ set -e
 
 dir=data/local/dict
+vocab_size=50000
+. ./utils/parse_options.sh
+
 mkdir -p $dir
 
 # First get the set of all letters that occur in data/train/text
@@ -22,7 +25,7 @@ cat data/train/text | \
 
 export letters=$(cat $dir/nonsilence_phones.txt | tr -d "\n")
 
-cat data/local/local_lm/data/wordlist | \
+head -n $vocab_size data/local/local_lm/data/word_count | awk '{print $2}' | \
   perl -e '$letters=$ENV{letters};
   while(<>){
     chop;
diff --git a/egs/iam/v1/local/train_lm.sh b/egs/iam/v1/local/train_lm.sh
index a673c5b3f2d..c47284e7692 100755
--- a/egs/iam/v1/local/train_lm.sh
+++ b/egs/iam/v1/local/train_lm.sh
@@ -108,7 +108,6 @@ if [ $stage -le 1 ]; then
                   ${dir}/data/text ${order} ${lm_dir}/work ${unpruned_lm_dir}
 
   get_data_prob.py ${dir}/data/real_dev_set.txt ${unpruned_lm_dir} 2>&1 | grep -F '[perplexity'
-  #log-prob: -5.05603614242 [perplexity = 156.967086371] over 19477.0 words
 fi
 
 if [ $stage -le 2 ]; then
@@ -118,9 +117,6 @@ if [ $stage -le 2 ]; then
   prune_lm_dir.py --target-num-ngrams=$size --initial-threshold=0.02 ${unpruned_lm_dir} ${dir}/data/lm_${order}_prune_big
 
   get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_big 2>&1 | grep -F '[perplexity'
-  # get_data_prob.py: log-prob of data/local/local_lm/data/real_dev_set.txt given model data/local/local_lm/data/lm_3_prune_big was -5.06654404785 per word [perplexity = 158.625177948] over 19477.0 words
-  # current results, after adding --limit-unk-history=true:
-
   mkdir -p ${dir}/data/arpa
   format_arpa_lm.py ${dir}/data/lm_${order}_prune_big | gzip -c > ${dir}/data/arpa/${order}gram_big.arpa.gz
 
@@ -134,9 +130,6 @@ if [ $stage -le 3 ]; then
   prune_lm_dir.py --target-num-ngrams=$size ${dir}/data/lm_${order}_prune_big ${dir}/data/lm_${order}_prune_small
 
   get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_small 2>&1 | grep -F '[perplexity'
-  # get_data_prob.py: log-prob of data/local/local_lm/data/real_dev_set.txt given model data/local/local_lm/data/lm_3_prune_small was -5.24719139498 per word [perplexity = 190.031793995] over 19477.0 words
-  # current results, after adding --limit-unk-history=true (needed for modeling OOVs and not blowing up LG.fst):
-
   format_arpa_lm.py ${dir}/data/lm_${order}_prune_small | gzip -c > ${dir}/data/arpa/${order}gram_small.arpa.gz
 fi
diff --git a/egs/iam/v1/run.sh b/egs/iam/v1/run.sh
index f5c4a2b8f80..00c9f682bf2 100755
--- a/egs/iam/v1/run.sh
+++ b/egs/iam/v1/run.sh
@@ -44,22 +44,32 @@ if [ $stage -le 2 ]; then
   echo "$0: Estimating a language model for decoding..."
   # We do this stage before dict preparation because prepare_dict.sh
   # generates the lexicon from pocolm's wordlist
-  local/train_lm.sh --vocab-size 50000
+  local/train_lm.sh --vocab-size 50k
 fi
 
 if [ $stage -le 3 ]; then
   echo "$0: Preparing dictionary and lang..."
-  local/prepare_dict.sh
+
+  # This is for training. Use a large vocab size, e.g. 500k to include all the
+  # training words:
+  local/prepare_dict.sh --vocab-size 500k --dir data/local/dict  # this is for training
   utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.95 \
                         data/local/dict "" data/lang/temp data/lang
-  utils/format_lm.sh data/lang data/local/local_lm/data/arpa/3gram_big.arpa.gz \
-                     data/local/dict/lexicon.txt data/lang_test
+
+  # This is for decoding. We use a 50k lexicon to be consistent with the papers
+  # reporting WERs on IAM:
+  local/prepare_dict.sh --vocab-size 50k --dir data/local/dict_50k  # this is for decoding
+  utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.95 \
+                        data/local/dict_50k "" data/lang_test/temp data/lang_test
+  utils/format_lm.sh data/lang_test data/local/local_lm/data/arpa/3gram_big.arpa.gz \
+                     data/local/dict_50k/lexicon.txt data/lang_test
+
   echo "$0: Preparing the unk model for open-vocab decoding..."
   utils/lang/make_unk_lm.sh --ngram-order 4 --num-extra-ngrams 7500 \
-                            data/local/dict exp/unk_lang_model
+                            data/local/dict_50k exp/unk_lang_model
   utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 \
                         --unk-fst exp/unk_lang_model/unk_fst.txt \
-                        data/local/dict "" data/local/temp data/lang_unk
+                        data/local/dict_50k "" data/lang_unk/temp data/lang_unk
   cp data/lang_test/G.fst data/lang_unk/G.fst
 fi
 
@@ -131,5 +141,5 @@ if [ $stage -le 13 ]; then
 fi
 
 if [ $stage -le 14 ]; then
-  local/chain/run_cnn_chainali_1b.sh --chain-model-dir exp/chain/cnn_1a --stage 2
+  local/chain/run_cnn_chainali_1c.sh --chain-model-dir exp/chain/cnn_1a --stage 2
 fi
diff --git a/egs/iam/v1/run_end2end.sh b/egs/iam/v1/run_end2end.sh
index d479bfa2a73..c9db5889d29 100755
--- a/egs/iam/v1/run_end2end.sh
+++ b/egs/iam/v1/run_end2end.sh
@@ -29,7 +29,7 @@ fi
 mkdir -p data/{train,test}/data
 
 if [ $stage -le 1 ]; then
-  get_image2num_frames.py data/train  # This will be needed for the next command
+  image/get_image2num_frames.py data/train  # This will be needed for the next command
   # The next command creates a "allowed_lengths.txt" file in data/train
   # which will be used by local/make_features.py to enforce the images to
   # have allowed lengths. The allowed lengths will be spaced by 10% difference in length.
@@ -45,32 +45,38 @@ if [ $stage -le 1 ]; then
 fi
 
 if [ $stage -le 2 ]; then
-  echo "$0: Preparing dictionary and lang..."
-  local/prepare_dict.sh
-  utils/prepare_lang.sh --sil-prob 0.95 \
-                        data/local/dict "" data/lang/temp data/lang
+  echo "$0: Estimating a language model for decoding..."
+  # We do this stage before dict preparation because prepare_dict.sh
+  # generates the lexicon from pocolm's wordlist
+  local/train_lm.sh --vocab-size 50k
 fi
 
 if [ $stage -le 3 ]; then
-  echo "$0: Estimating a language model for decoding..."
-  local/train_lm.sh
-  utils/format_lm.sh data/lang data/local/local_lm/data/arpa/3gram_big.arpa.gz \
-                     data/local/dict/lexicon.txt data/lang_test
-fi
+  echo "$0: Preparing dictionary and lang..."
+
+  # This is for training. Use a large vocab size, e.g. 500k to include all the
+  # training words:
+  local/prepare_dict.sh --vocab-size 500k --dir data/local/dict
+  utils/prepare_lang.sh --sil-prob 0.95 \
+                        data/local/dict "" data/lang/temp data/lang
+  # This is for decoding. We use a 50k lexicon to be consistent with the papers
+  # reporting WERs on IAM.
+  local/prepare_dict.sh --vocab-size 50k --dir data/local/dict_50k
+  utils/prepare_lang.sh --sil-prob 0.95 data/local/dict_50k \
+                        "" data/lang_test/temp data/lang_test
+  utils/format_lm.sh data/lang_test data/local/local_lm/data/arpa/3gram_big.arpa.gz \
+                     data/local/dict_50k/lexicon.txt data/lang_test
 
-if [ $stage -le 4 ]; then
-  echo "$0: estimating phone language model for the denominator graph"
-  mkdir -p exp/chain/e2e_base/log
-  $cmd exp/chain/e2e_base/log/make_phone_lm.log \
-  cat data/train/text \| \
-    steps/nnet3/chain/e2e/text_to_phones.py data/lang \| \
-    utils/sym2int.pl -f 2- data/lang/phones.txt \| \
-    chain-est-phone-lm --num-extra-lm-states=1000 \
-    ark:- exp/chain/e2e_base/phone_lm.fst
+  echo "$0: Preparing the unk model for open-vocab decoding..."
+  utils/lang/make_unk_lm.sh --ngram-order 4 --num-extra-ngrams 7500 \
+                            data/local/dict_50k exp/unk_lang_model
+  utils/prepare_lang.sh --unk-fst exp/unk_lang_model/unk_fst.txt \
+                        data/local/dict_50k "" data/lang_unk/temp data/lang_unk
+  cp data/lang_test/G.fst data/lang_unk/G.fst
 fi
 
-if [ $stage -le 5 ]; then
-  echo "$0: calling the flat-start chain recipe..."
+if [ $stage -le 4 ]; then
+  echo "$0: Calling the flat-start chain recipe..."
   local/chain/run_flatstart_cnn1a.sh
 fi
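
For reference, the LM and two-lexicon preparation flow that the modified run.sh stages 2-3 implement is, in order, the sequence below. This is a minimal usage sketch assembled only from commands appearing in this diff; it assumes the earlier IAM data-preparation stages have already produced data/train/text and that pocolm writes its outputs under data/local/local_lm.

  # 50k-vocabulary pocolm LM; prepare_dict.sh later reads its word counts.
  local/train_lm.sh --vocab-size 50k

  # Large (500k) lexicon and data/lang, used only for training.
  local/prepare_dict.sh --vocab-size 500k --dir data/local/dict
  utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.95 \
                        data/local/dict "" data/lang/temp data/lang

  # 50k lexicon and data/lang_test (G.fst added by format_lm.sh), used for decoding.
  local/prepare_dict.sh --vocab-size 50k --dir data/local/dict_50k
  utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.95 \
                        data/local/dict_50k "" data/lang_test/temp data/lang_test
  utils/format_lm.sh data/lang_test data/local/local_lm/data/arpa/3gram_big.arpa.gz \
                     data/local/dict_50k/lexicon.txt data/lang_test

  # Open-vocabulary decoding: an unk model layered on the 50k lexicon.
  utils/lang/make_unk_lm.sh --ngram-order 4 --num-extra-ngrams 7500 \
                            data/local/dict_50k exp/unk_lang_model
  utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 \
                        --unk-fst exp/unk_lang_model/unk_fst.txt \
                        data/local/dict_50k "" data/lang_unk/temp data/lang_unk
  cp data/lang_test/G.fst data/lang_unk/G.fst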