From ea39395ca563bce8fd02b4f3f25f55ad931b49cc Mon Sep 17 00:00:00 2001 From: Hossein Hadian Date: Tue, 7 Aug 2018 14:00:24 -0400 Subject: [PATCH 1/4] Add a BPE-based recipe for IAM --- egs/iam/v2/cmd.sh | 13 + egs/iam/v2/image | 1 + egs/iam/v2/local/chain/compare_wer.sh | 90 +++++++ egs/iam/v2/local/chain/run_cnn_e2eali.sh | 1 + egs/iam/v2/local/chain/run_e2e_cnn.sh | 170 ++++++++++++ .../local/chain/tuning/run_cnn_e2eali_1a.sh | 245 +++++++++++++++++ .../local/chain/tuning/run_cnn_e2eali_1b.sh | 251 +++++++++++++++++ .../local/chain/tuning/run_cnn_e2eali_1c.sh | 253 ++++++++++++++++++ egs/iam/v2/local/check_tools.sh | 43 +++ egs/iam/v2/local/make_features.py | 127 +++++++++ egs/iam/v2/local/prepare_data.sh | 170 ++++++++++++ egs/iam/v2/local/prepare_dict.sh | 50 ++++ egs/iam/v2/local/prepend_words.py | 13 + egs/iam/v2/local/process_data.py | 82 ++++++ .../local/remove_test_utterances_from_lob.py | 117 ++++++++ .../v2/local/remove_wellington_annotations.py | 32 +++ egs/iam/v2/local/score.sh | 155 +++++++++++ egs/iam/v2/local/srilm_train.sh | 49 ++++ egs/iam/v2/local/train_lm.sh | 151 +++++++++++ egs/iam/v2/local/wer_output_filter | 31 +++ egs/iam/v2/path.sh | 9 + egs/iam/v2/run_end2end.sh | 104 +++++++ egs/iam/v2/steps | 1 + egs/iam/v2/utils | 1 + 24 files changed, 2159 insertions(+) create mode 100644 egs/iam/v2/cmd.sh create mode 120000 egs/iam/v2/image create mode 100755 egs/iam/v2/local/chain/compare_wer.sh create mode 120000 egs/iam/v2/local/chain/run_cnn_e2eali.sh create mode 100755 egs/iam/v2/local/chain/run_e2e_cnn.sh create mode 100755 egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1a.sh create mode 100755 egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1b.sh create mode 100755 egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1c.sh create mode 100755 egs/iam/v2/local/check_tools.sh create mode 100755 egs/iam/v2/local/make_features.py create mode 100755 egs/iam/v2/local/prepare_data.sh create mode 100755 egs/iam/v2/local/prepare_dict.sh create mode 100755 
egs/iam/v2/local/prepend_words.py create mode 100755 egs/iam/v2/local/process_data.py create mode 100755 egs/iam/v2/local/remove_test_utterances_from_lob.py create mode 100755 egs/iam/v2/local/remove_wellington_annotations.py create mode 100755 egs/iam/v2/local/score.sh create mode 100755 egs/iam/v2/local/srilm_train.sh create mode 100755 egs/iam/v2/local/train_lm.sh create mode 100755 egs/iam/v2/local/wer_output_filter create mode 100755 egs/iam/v2/path.sh create mode 100755 egs/iam/v2/run_end2end.sh create mode 120000 egs/iam/v2/steps create mode 120000 egs/iam/v2/utils diff --git a/egs/iam/v2/cmd.sh b/egs/iam/v2/cmd.sh new file mode 100644 index 00000000000..3c8eb9f93a5 --- /dev/null +++ b/egs/iam/v2/cmd.sh @@ -0,0 +1,13 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 
+ +export cmd="queue.pl" diff --git a/egs/iam/v2/image b/egs/iam/v2/image new file mode 120000 index 00000000000..1668ee99922 --- /dev/null +++ b/egs/iam/v2/image @@ -0,0 +1 @@ +../../cifar/v1/image/ \ No newline at end of file diff --git a/egs/iam/v2/local/chain/compare_wer.sh b/egs/iam/v2/local/chain/compare_wer.sh new file mode 100755 index 00000000000..d4076457463 --- /dev/null +++ b/egs/iam/v2/local/chain/compare_wer.sh @@ -0,0 +1,90 @@ +#!/bin/bash + +# this script is used for comparing decoding results between systems. +# e.g. local/chain/compare_wer.sh exp/chain/cnn{1a,1b} + +# Copyright 2017 Chun Chieh Chang +# 2017 Ashish Arora + +if [ $# == 0 ]; then + echo "Usage: $0: [ ... ]" + echo "e.g.: $0 exp/chain/cnn{1a,1b}" + exit 1 +fi +. ./path.sh + +echo "# $0 $*" +used_epochs=false + +echo -n "# System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +echo -n "# WER " +for x in $*; do + wer=$(cat $x/decode_test/scoring_kaldi/best_wer | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "# WER (rescored) " +for x in $*; do + wer="--" + [ -d $x/decode_test_rescored ] && wer=$(cat $x/decode_test_rescored/scoring_kaldi/best_wer | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "# CER " +for x in $*; do + cer=$(cat $x/decode_test/scoring_kaldi/best_cer | awk '{print $2}') + printf "% 10s" $cer +done +echo + +echo -n "# CER (rescored) " +for x in $*; do + cer="--" + [ -d $x/decode_test_rescored ] && cer=$(cat $x/decode_test_rescored/scoring_kaldi/best_cer | awk '{print $2}') + printf "% 10s" $cer +done +echo + +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. 
+fi + +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final train prob (xent) " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob (xent) " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Parameters " +for x in $*; do + params=$(nnet3-info $x/final.mdl 2>/dev/null | grep num-parameters | cut -d' ' -f2 | awk '{printf "%0.2fM\n",$1/1000000}') + printf "% 10s" $params +done +echo diff --git a/egs/iam/v2/local/chain/run_cnn_e2eali.sh b/egs/iam/v2/local/chain/run_cnn_e2eali.sh new file mode 120000 index 00000000000..ad51803ab0e --- /dev/null +++ b/egs/iam/v2/local/chain/run_cnn_e2eali.sh @@ -0,0 +1 @@ +tuning/run_cnn_e2eali_1c.sh \ No newline at end of file diff --git a/egs/iam/v2/local/chain/run_e2e_cnn.sh b/egs/iam/v2/local/chain/run_e2e_cnn.sh new file mode 100755 index 00000000000..0a49e980be4 --- /dev/null +++ b/egs/iam/v2/local/chain/run_e2e_cnn.sh @@ -0,0 +1,170 @@ +#!/bin/bash +# Copyright 2017 Hossein Hadian + +# This script does end2end chain training (i.e. 
from scratch) + +# local/chain/compare_wer.sh exp/chain/cnn_1a exp/chain/cnn_chainali_1c exp/chain/e2e_cnn_1a +# System cnn_1a cnn_chainali_1c e2e_cnn_1a +# WER 18.52 12.72 12.15 +# CER 10.07 5.99 6.03 +# Final train prob -0.0077 -0.0291 -0.0371 +# Final valid prob -0.0970 -0.0359 -0.0636 +# Final train prob (xent) -0.5484 -0.9781 +# Final valid prob (xent) -0.9643 -1.1544 +# Parameters 4.36M 3.96M 9.13M + +# steps/info/chain_dir_info.pl exp/chain/e2e_cnn_1a +# exp/chain/e2e_cnn_1a: num-iters=21 nj=2..4 num-params=9.1M dim=40->12640 combine=-0.033->-0.033 (over 1) logprob:train/valid[13,20,final]=(-0.058,-0.042,-0.035/-0.070,-0.064,-0.059) + +set -e + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +affix=1a + +# training options +tdnn_dim=450 +num_epochs=4 +num_jobs_initial=2 +num_jobs_final=4 +minibatch_size=150=100,64/300=50,32/600=25,16/1200=16,8 +common_egs_dir= +l2_regularize=0.00005 +frames_per_iter=1000000 +cmvn_opts="--norm-means=true --norm-vars=true" +train_set=train +lang_test=lang_unk + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 1 ]; then + steps/nnet3/chain/e2e/prepare_e2e.sh --nj 30 --cmd "$cmd" \ + --shared-phones true \ + --type biphone \ + data/$train_set $lang $treedir + $cmd $treedir/log/make_phone_lm.log \ + cat data/$train_set/text \| \ + steps/nnet3/chain/e2e/text_to_phones.py data/lang \| \ + utils/sym2int.pl -f 2- data/lang/phones.txt \| \ + chain-est-phone-lm --num-extra-lm-states=500 \ + ark:- $treedir/phone_lm.fst +fi + +if [ $stage -le 2 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}') + + cnn_opts="l2-regularize=0.075" + tdnn_opts="l2-regularize=0.075" + output_opts="l2-regularize=0.1" + common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + + conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn6 height-in=10 height-out=10 time-offsets=-1,0,1 $common3 + conv-relu-batchnorm-layer name=cnn7 height-in=10 height-out=10 time-offsets=-1,0,1 $common3 + relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim 
$tdnn_opts + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $output_opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs +fi + +if [ $stage -le 3 ]; then + # no need to store the egs in a shared storage because we always + # remove them. Anyway, it takes only 5 minutes to generate them. + + steps/nnet3/chain/e2e/train_e2e.py --stage $train_stage \ + --cmd "$cmd" \ + --feat.cmvn-opts "$cmvn_opts" \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize $l2_regularize \ + --chain.apply-deriv-weights false \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--num_egs_diagnostic 100 --num_utts_subset 400" \ + --chain.frame-subsampling-factor 4 \ + --chain.alignment-subsampling-factor 4 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter $frames_per_iter \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.momentum 0 \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.shrink-value 1.0 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs true \ + --feat-dir data/${train_set} \ + --tree-dir $treedir \ + --dir $dir || exit 1; +fi + +if [ $stage -le 4 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). 
So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/$lang_test \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 5 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 30 --cmd "$cmd" \ + $dir/graph data/test $dir/decode_test || exit 1; +fi + +echo "Done. Date: $(date). Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1a.sh b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1a.sh new file mode 100755 index 00000000000..ba28f681708 --- /dev/null +++ b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1a.sh @@ -0,0 +1,245 @@ +#!/bin/bash + +# e2eali_1a is the same as chainali_1c but uses the e2e chain model to get the +# lattice alignments and to build a tree + +# local/chain/compare_wer.sh exp/chain/e2e_cnn_1a exp/chain/cnn_chainali_1c exp/chain/cnn_e2eali_1a +# System e2e_cnn_1a cnn_chainali_1c cnn_e2eali_1a +# WER 13.87 12.72 12.70 +# CER 6.54 5.99 5.75 +# Final train prob -0.0371 -0.0291 -0.0557 +# Final valid prob -0.0636 -0.0359 -0.0770 +# Final train prob (xent) -0.9781 -0.8847 +# Final valid prob (xent) -1.1544 -1.0370 +# Parameters 9.13M 3.96M 3.95M + +# steps/info/chain_dir_info.pl exp/chain/cnn_e2eali_1a +# exp/chain/cnn_e2eali_1a: num-iters=21 nj=2..4 num-params=4.0M dim=40->360 combine=-0.056->-0.056 (over 1) xent:train/valid[13,20,final]=(-1.47,-0.978,-0.918/-1.54,-1.10,-1.06) logprob:train/valid[13,20,final]=(-0.106,-0.065,-0.056/-0.113,-0.086,-0.079) + +set -e -o pipefail + +stage=0 + +nj=30 +train_set=train +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. +affix=_1a #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. 
+e2echain_model_dir=exp/chain/e2e_cnn_1a +common_egs_dir= +reporting_email= + +# chain options +train_stage=-10 +xent_regularize=0.1 +frame_subsampling_factor=4 +# training chunk-options +chunk_width=340,300,200,100 +num_leaves=500 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 +tdnn_dim=450 +# training options +srand=0 +remove_egs=true +lang_test=lang_unk +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ + --acoustic-scale 1.0 \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + ${train_data_dir} data/lang $e2echain_model_dir $lat_dir + echo "" >$lat_dir/splice_opts +fi + +if [ $stage -le 3 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$cmd" $num_leaves ${train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 4 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(python -c "print(0.5/$xent_regularize)") # print() call runs under Python 2 and 3 + cnn_opts="l2-regularize=0.075" + tdnn_opts="l2-regularize=0.075" + output_opts="l2-regularize=0.1" + common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + + conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn6 height-in=10 height-out=10 time-offsets=-1,0,1 $common3 + conv-relu-batchnorm-layer name=cnn7 height-in=10 height-out=10 time-offsets=-1,0,1 $common3 + relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + + ## adding 
the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' mod?els... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 5 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=500" \ + --chain.frame-subsampling-factor=$frame_subsampling_factor \ + --chain.alignment-subsampling-factor=1 \ + --chain.left-tolerance 3 \ + --chain.right-tolerance 3 \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=4 \ + --trainer.frames-per-iter=1000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=4 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=64,32 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 6 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). 
So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/$lang_test \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 7 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/test $dir/decode_test || exit 1; +fi diff --git a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1b.sh b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1b.sh new file mode 100755 index 00000000000..298e7053086 --- /dev/null +++ b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1b.sh @@ -0,0 +1,251 @@ +#!/bin/bash + +# e2eali_1b is the same as e2eali_1a but uses unconstrained egs + +# local/chain/compare_wer.sh exp/chain/cnn_e2eali_1a exp/chain/cnn_e2eali_1b +# System cnn_e2eali_1a cnn_e2eali_1b +# WER 10.40 10.33 +# WER (rescored) 10.02 10.10 +# CER 4.97 5.00 +# CER (rescored) 4.83 4.88 +# Final train prob -0.0612 -0.0428 +# Final valid prob -0.0857 -0.0666 +# Final train prob (xent) -0.8990 -0.9210 +# Final valid prob (xent) -1.0024 -1.0264 +# Parameters 3.98M 3.98M + +# steps/info/chain_dir_info.pl exp/chain/cnn_e2eali_1b +# exp/chain/cnn_e2eali_1b: num-iters=21 nj=2..4 num-params=4.0M dim=40->360 combine=-0.038->-0.038 (over 1) xent:train/valid[13,20,final]=(-1.34,-0.967,-0.838/-1.40,-1.07,-0.985) logprob:train/valid[13,20,final]=(-0.075,-0.054,-0.037/-0.083,-0.072,-0.059) + +set -e -o pipefail + +stage=0 + +nj=30 +train_set=train +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. +affix=_1b #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. 
+e2echain_model_dir=exp/chain/e2e_cnn_1a +common_egs_dir= +reporting_email= + +# chain options +train_stage=-10 +xent_regularize=0.1 +frame_subsampling_factor=4 +# training chunk-options +chunk_width=340,300,200,100 +num_leaves=500 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 +tdnn_dim=450 +# training options +srand=0 +remove_egs=true +lang_decode=data/lang +lang_rescore=data/lang_rescore_6g + +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ + --acoustic-scale 1.0 \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + ${train_data_dir} data/lang $e2echain_model_dir $lat_dir + echo "" >$lat_dir/splice_opts +fi + +if [ $stage -le 3 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$cmd" $num_leaves ${train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 4 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(python -c "print(0.5/$xent_regularize)") # print() call runs under Python 2 and 3 + cnn_opts="l2-regularize=0.075" + tdnn_opts="l2-regularize=0.075" + output_opts="l2-regularize=0.1" + common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + + conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn6 height-in=10 height-out=10 time-offsets=-1,0,1 $common3 + conv-relu-batchnorm-layer name=cnn7 height-in=10 height-out=10 time-offsets=-1,0,1 $common3 + relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + + ## adding 
the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' mod?els... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 5 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=500" \ + --chain.frame-subsampling-factor=$frame_subsampling_factor \ + --chain.alignment-subsampling-factor=1 \ + --chain.left-tolerance 3 \ + --chain.right-tolerance 3 \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=4 \ + --trainer.frames-per-iter=1000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=4 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=64,32 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0 --constrained false" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 6 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). 
So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/mkgraph.sh \ + --self-loop-scale 1.0 $lang_decode \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 7 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/test $dir/decode_test || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/test $dir/decode_test{,_rescored} || exit 1 +fi diff --git a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1c.sh b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1c.sh new file mode 100755 index 00000000000..ef851c8ae2f --- /dev/null +++ b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1c.sh @@ -0,0 +1,253 @@ +#!/bin/bash + +# e2eali_1c is the same as e2eali_1b but has fewer CNN layers, smaller +# l2-regularize, more epochs and uses dropout. 
+ + +# local/chain/compare_wer.sh exp/chain/cnn_e2eali_1b exp/chain/cnn_e2eali_1c +# System cnn_e2eali_1b cnn_e2eali_1c +# WER 10.33 10.05 +# WER (rescored) 10.10 9.75 +# CER 5.00 4.76 +# CER (rescored) 4.88 4.68 +# Final train prob -0.0428 -0.0317 +# Final valid prob -0.0666 -0.0630 +# Final train prob (xent) -0.9210 -0.5413 +# Final valid prob (xent) -1.0264 -0.7096 +# Parameters 3.98M 5.12M + +# steps/info/chain_dir_info.pl exp/chain/cnn_e2eali_1c +# exp/chain/cnn_e2eali_1c: num-iters=21 nj=2..4 num-params=5.1M dim=40->392 combine=-0.034->-0.034 (over 1) xent:train/valid[13,20,final]=(-0.953,-0.800,-0.541/-1.03,-0.933,-0.710) logprob:train/valid[13,20,final]=(-0.069,-0.048,-0.032/-0.091,-0.078,-0.063) + +set -e -o pipefail + +stage=0 + +nj=30 +train_set=train +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. +affix=_1b6 #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +e2echain_model_dir=exp/chain/e2e_cnn_1a +common_egs_dir= +reporting_email= + +# chain options +train_stage=-10 +xent_regularize=0.1 +frame_subsampling_factor=4 +# training chunk-options +chunk_width=340,300,200,100 +num_leaves=500 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 +tdnn_dim=550 +# training options +srand=0 +remove_egs=true +lang_decode=data/lang +lang_rescore=data/lang_rescore_6g + +dropout_schedule='0,0@0.20,0.2@0.50,0' +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). 
+ # use the same num-jobs as the alignments + steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ + --acoustic-scale 1.0 \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + ${train_data_dir} data/lang $e2echain_model_dir $lat_dir + echo "" >$lat_dir/splice_opts +fi + +if [ $stage -le 3 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$cmd" $num_leaves ${train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 4 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(python -c "print(0.5/$xent_regularize)") # print() call runs under Python 2 and 3 + cnn_opts="l2-regularize=0.03 dropout-proportion=0.0" + tdnn_opts="l2-regularize=0.03" + output_opts="l2-regularize=0.04" + common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + + conv-relu-batchnorm-dropout-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-dropout-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-dropout-layer name=cnn3 
height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-dropout-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-dropout-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + relu-batchnorm-dropout-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + relu-batchnorm-dropout-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + relu-batchnorm-dropout-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' mod?els... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 5 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=500" \ + --chain.frame-subsampling-factor=$frame_subsampling_factor \ + --chain.alignment-subsampling-factor=1 \ + --chain.left-tolerance 3 \ + --chain.right-tolerance 3 \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=8 \ + --trainer.frames-per-iter=2000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=4 \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=64,32 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0 --constrained false" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 6 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). 
So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/mkgraph.sh \ + --self-loop-scale 1.0 $lang_decode \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 7 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/test $dir/decode_test || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/test $dir/decode_test{,_rescored} || exit 1 +fi diff --git a/egs/iam/v2/local/check_tools.sh b/egs/iam/v2/local/check_tools.sh new file mode 100755 index 00000000000..5b4d3107d3b --- /dev/null +++ b/egs/iam/v2/local/check_tools.sh @@ -0,0 +1,43 @@ +#!/bin/bash -u + +# Copyright 2015 (c) Johns Hopkins University (Jan Trmal ) + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + +[ -f ./path.sh ] && . ./path.sh +set +e + +command -v python3 >&/dev/null \ + || { echo >&2 "python3 not found on PATH. You will have to install Python3, preferably >= 3.6"; exit 1; } + +python3 -c "import numpy" +if [ $? -ne 0 ] ; then + echo >&2 "This recipe needs numpy installed." 
+ exit 1 +fi + +python3 -c "import scipy" +if [ $? -ne 0 ] ; then + echo >&2 "This recipe needs scipy installed." + exit 1 +fi + +python3 -c "import scipy.misc; scipy.misc.__dict__['imread']" +if [ $? -ne 0 ] ; then + echo >&2 "This recipe needs scipy-image and Pillow installed." + exit 1 +fi + + +exit 0 diff --git a/egs/iam/v2/local/make_features.py b/egs/iam/v2/local/make_features.py new file mode 100755 index 00000000000..84e012daedb --- /dev/null +++ b/egs/iam/v2/local/make_features.py @@ -0,0 +1,127 @@ +#!/usr/bin/env python3 + +# Copyright 2017 Chun Chieh Chang +# 2017 Ashish Arora +# 2018 Hossein Hadian + +""" This script converts images to Kaldi-format feature matrices. The input to + this script is the path to a data directory, e.g. "data/train". This script + reads the images listed in images.scp and writes them to standard output + (by default) as Kaldi-formatted matrices (in text form). It also scales the + images so they have the same height (via --feat-dim). It can optionally pad + the images (on left/right sides) with white pixels. + If an 'image2num_frames' file is found in the data dir, it will be used + to enforce the images to have the specified length in that file by padding + white pixels (the --padding option will be ignored in this case). This relates + to end2end chain training. + + eg. 
local/make_features.py data/train --feat-dim 40 +""" + +import argparse +import os +import sys +import numpy as np +from scipy import misc + +parser = argparse.ArgumentParser(description="""Converts images (in 'dir'/images.scp) to features and + writes them to standard output in text format.""") +parser.add_argument('dir', type=str, + help='Source data directory (containing images.scp)') +parser.add_argument('--out-ark', type=str, default='-', + help='Where to write the output feature file') +parser.add_argument('--feat-dim', type=int, default=40, + help='Size to scale the height of all images') +parser.add_argument('--padding', type=int, default=5, + help='Number of white pixels to pad on the left' + 'and right side of the image.') + + +args = parser.parse_args() + + +def write_kaldi_matrix(file_handle, matrix, key): + file_handle.write(key + " [ ") + num_rows = len(matrix) + if num_rows == 0: + raise Exception("Matrix is empty") + num_cols = len(matrix[0]) + + for row_index in range(len(matrix)): + if num_cols != len(matrix[row_index]): + raise Exception("All the rows of a matrix are expected to " + "have the same length") + file_handle.write(" ".join(map(lambda x: str(x), matrix[row_index]))) + if row_index != num_rows - 1: + file_handle.write("\n") + file_handle.write(" ]\n") + +def get_scaled_image(im, allowed_lengths = None): + scale_size = args.feat_dim + sx = im.shape[1] + sy = im.shape[0] + scale = (1.0 * scale_size) / sy + nx = int(scale_size) + ny = int(scale * sx) + im = misc.imresize(im, (nx, ny)) + if allowed_lengths is None: + left_padding = right_padding = args.padding + else: # Find an allowed length for the image + imlen = im.shape[1] + allowed_len = 0 + for l in allowed_lengths: + if l > imlen: + allowed_len = l + break + if allowed_len == 0: + # No allowed length was found for the image (the image is too long) + return None + padding = allowed_len - imlen + left_padding = padding // 2 + right_padding = padding - left_padding + dim_y = 
im.shape[0] + im_pad = np.concatenate((255 * np.ones((dim_y, left_padding), + dtype=int), im), axis=1) + im_pad1 = np.concatenate((im_pad, 255 * np.ones((dim_y, right_padding), + dtype=int)), axis=1) + return im_pad1 + +### main ### +data_list_path = os.path.join(args.dir, 'images.scp') + +if args.out_ark == '-': + out_fh = sys.stdout +else: + out_fh = open(args.out_ark,'wb') + +allowed_lengths = None +if os.path.isfile(os.path.join(args.dir, 'allowed_lengths.txt')): + print("Found 'allowed_lengths.txt' file...", file=sys.stderr) + allowed_lengths = [] + with open(os.path.join(args.dir,'allowed_lengths.txt')) as f: + for line in f: + allowed_lengths.append(int(line.strip())) + print("Read {} allowed lengths and will apply them to the " + "features.".format(len(allowed_lengths)), file=sys.stderr) + +num_fail = 0 +num_ok = 0 +with open(data_list_path) as f: + for line in f: + line = line.strip() + line_vect = line.split(' ') + image_id = line_vect[0] + image_path = line_vect[1] + im = misc.imread(image_path) + im_scaled = get_scaled_image(im, allowed_lengths) + + if im_scaled is None: + num_fail += 1 + continue + data = np.transpose(im_scaled, (1, 0)) + data = np.divide(data, 255.0) + num_ok += 1 + write_kaldi_matrix(out_fh, data, image_id) + +print('Generated features for {} images. Failed for {} (iamge too ' + 'long).'.format(num_ok, num_fail), file=sys.stderr) diff --git a/egs/iam/v2/local/prepare_data.sh b/egs/iam/v2/local/prepare_data.sh new file mode 100755 index 00000000000..73d711c73f0 --- /dev/null +++ b/egs/iam/v2/local/prepare_data.sh @@ -0,0 +1,170 @@ +#!/bin/bash + +# Copyright 2017 Chun Chieh Chang +# 2017 Ashish Arora +# 2017 Hossein Hadian +# Apache 2.0 + +# This script downloads the IAM handwriting database and prepares the training +# and test data (i.e text, images.scp, utt2spk and spk2utt) by calling process_data.py. +# It also downloads the LOB and Brown text corpora. 
It downloads the database files +# only if they do not already exist in download directory. + +# Eg. local/prepare_data.sh +# Eg. text file: 000_a01-000u-00 A MOVE to stop Mr. Gaitskell from +# utt2spk file: 000_a01-000u-00 000 +# images.scp file: 000_a01-000u-00 data/local/lines/a01/a01-000u/a01-000u-00.png +# spk2utt file: 000 000_a01-000u-00 000_a01-000u-01 000_a01-000u-02 000_a01-000u-03 + +stage=0 +download_dir=data/download +wellington_dir= +username= +password= # username and password for downloading the IAM database + # if you have not already downloaded the database, please + # register at http://www.fki.inf.unibe.ch/databases/iam-handwriting-database + # and provide this script with your username and password. + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh || exit 1; + +if [[ ! -f $download_dir/lines.tgz && -z $username ]]; then + echo "$0: Warning: Couldn't find lines.tgz in $download_dir. Unless the extracted dataset files" + echo "exist in your data/local directory this script will fail because the required files" + echo "can't be downloaded automatically (it needs registration)." + echo "Please register at http://www.fki.inf.unibe.ch/databases/iam-handwriting-database" + echo "... 
and then call this script again with --username --password " + echo "" + exit 1 +fi + +lines=data/local/lines +xml=data/local/xml +ascii=data/local/ascii +bcorpus=data/local/browncorpus +lobcorpus=data/local/lobcorpus +wcorpus=data/local/wellingtoncorpus +data_split_info=data/local/largeWriterIndependentTextLineRecognitionTask +lines_url=http://www.fki.inf.unibe.ch/DBs/iamDB/data/lines/lines.tgz +xml_url=http://www.fki.inf.unibe.ch/DBs/iamDB/data/xml/xml.tgz +data_split_info_url=http://www.fki.inf.unibe.ch/DBs/iamDB/tasks/largeWriterIndependentTextLineRecognitionTask.zip +ascii_url=http://www.fki.inf.unibe.ch/DBs/iamDB/data/ascii/ascii.tgz +brown_corpus_url=http://www.sls.hawaii.edu/bley-vroman/brown.txt +lob_corpus_url=http://ota.ox.ac.uk/text/0167.zip +wellington_corpus_loc=/export/corpora5/Wellington/WWC/ +mkdir -p $download_dir data/local + +# download and extact images and transcription +if [ -d $lines ]; then + echo "$0: Not downloading lines images as it is already there." +else + if [ ! -f $download_dir/lines.tgz ]; then + echo "$0: Trying to download lines images..." + wget -P $download_dir --user "$username" --password "$password" $lines_url || exit 1; + fi + mkdir -p $lines + tar -xzf $download_dir/lines.tgz -C $lines || exit 1; + echo "$0: Done downloading and extracting lines images" +fi + +if [ -d $xml ]; then + echo "$0: Not downloading transcriptions as it is already there." +else + if [ ! -f $download_dir/xml.tgz ]; then + echo "$0: Trying to download transcriptions..." + wget -P $download_dir --user "$username" --password "$password" $xml_url || exit 1; + fi + mkdir -p $xml + tar -xzf $download_dir/xml.tgz -C $xml || exit 1; + echo "$0: Done downloading and extracting transcriptions." +fi + +if [ -d $data_split_info ]; then + echo "$0: Not downloading data split information as it is already there." +else + if [ ! 
-f $download_dir/largeWriterIndependentTextLineRecognitionTask.zip ]; then + echo "$0: Trying to download training and testing data split information..." + wget -P $download_dir --user "$username" --password "$password" $data_split_info_url || exit 1; + fi + mkdir -p $data_split_info + unzip $download_dir/largeWriterIndependentTextLineRecognitionTask.zip -d $data_split_info || exit 1; + echo "$0: Done downloading and extracting training and testing data split information" +fi + +if [ -d $ascii ]; then + echo "$0: Not downloading ascii.tgz as it is already there." +else + if [ ! -f $download_dir/ascii.tgz ]; then + echo "$0: trying to download ascii.tgz..." + wget -P $download_dir --user "$username" --password "$password" $ascii_url || exit 1; + fi + mkdir -p $ascii + tar -xzf $download_dir/ascii.tgz -C $ascii || exit 1; + echo "$0: Done downloading and extracting ascii.tgz" +fi + +if [ -d $lobcorpus ]; then + echo "$0: Not downloading the LOB text corpus as it is already there." +else + if [ ! -f $lobcorpus/0167.zip ]; then + echo "$0: Downloading the LOB text corpus ..." + mkdir -p $lobcorpus + wget -P $lobcorpus/ $lob_corpus_url || exit 1; + fi + unzip $lobcorpus/0167.zip -d $lobcorpus || exit 1; + echo "$0: Done downloading and extracting LOB corpus" +fi + +if [ -d $bcorpus ]; then + echo "$0: Not downloading the Brown corpus as it is already there." +else + if [ ! -f $bcorpus/brown.txt ]; then + mkdir -p $bcorpus + echo "$0: Downloading the Brown text corpus..." + wget -P $bcorpus $brown_corpus_url || exit 1; + fi + echo "$0: Done downloading the Brown text corpus" +fi + +if [ -d $wcorpus ]; then + echo "$0: Not copying Wellington corpus as it is already there." +elif [ ! -z $wellington_dir ]; then + mkdir -p $wcorpus + cp -r $wellington_dir/. 
$wcorpus + + # Combine Wellington corpora and replace some of their annotations + cat data/local/wellingtoncorpus/Section{A,B,C,D,E,F,G,H,J,K,L}.txt | \ + cut -d' ' -f3- | sed "s/^[ \t]*//" > data/local/wellingtoncorpus/Wellington_annotated.txt + + cat data/local/wellingtoncorpus/Wellington_annotated.txt | local/remove_wellington_annotations.py > data/local/wellingtoncorpus/Wellington_annotation_removed.txt + + echo "$0: Done copying Wellington corpus" +else + echo "$0: Wellington Corpus not included because wellington_dir not provided" +fi + +mkdir -p data/{train,test,val} +file_name=largeWriterIndependentTextLineRecognitionTask + +train_old="data/local/$file_name/trainset.txt" +test_old="data/local/$file_name/testset.txt" +val1_old="data/local/$file_name/validationset1.txt" +val2_old="data/local/$file_name/validationset2.txt" + +train_new="data/local/train.uttlist" +test_new="data/local/test.uttlist" +val_new="data/local/validation.uttlist" + +cat $train_old > $train_new +cat $test_old > $test_new +cat $val1_old $val2_old > $val_new + +if [ $stage -le 0 ]; then + local/process_data.py data/local data/train --dataset train || exit 1 + local/process_data.py data/local data/test --dataset test || exit 1 + local/process_data.py data/local data/val --dataset validation || exit 1 + + utils/utt2spk_to_spk2utt.pl data/train/utt2spk > data/train/spk2utt + utils/utt2spk_to_spk2utt.pl data/test/utt2spk > data/test/spk2utt +fi diff --git a/egs/iam/v2/local/prepare_dict.sh b/egs/iam/v2/local/prepare_dict.sh new file mode 100755 index 00000000000..e21a59c7e92 --- /dev/null +++ b/egs/iam/v2/local/prepare_dict.sh @@ -0,0 +1,50 @@ +#!/usr/bin/env bash + +# Copyright 2017 Hossein Hadian +# 2017 Chun Chieh Chang +# 2017 Ashish Arora + +# This script prepares the dictionary. + +set -e +dir=data/local/dict +vocab_size=50000 +. 
./utils/parse_options.sh + +mkdir -p $dir + +# First get the set of all letters that occur in data/train/text +cat data/train/text | \ + perl -ne '@A = split; shift @A; for(@A) {print join("\n", split(//)), "\n";}' | \ + sort -u | grep -v "|" > $dir/nonsilence_phones.txt + +# Now use the pocolm's wordlist which is the most N frequent words in +# in data/train/text and LOB+Brown corpora (dev and test excluded) with their comprising +# letters as their transcription. Only include words that use the above letters. +# (Letter # is replaced with ) + +export letters=$(cat $dir/nonsilence_phones.txt | tr -d "\n") + +head -n $vocab_size data/local/local_lm/data/word_count | awk '{print $2}' | \ + perl -e '$letters=$ENV{letters}; $letters=$letters . "|"; +while(<>){ + chop; + $w = $_; + if($w =~ m/^[$letters]+$/){ + $trans = join(" ", split(//, $w)); + $trans =~ s/#//g; + $trans =~ s/\|/SIL/g; + print "$w $trans\n"; + } +}' | sort -u > $dir/lexicon.txt + + +sed -i "s/#//" $dir/nonsilence_phones.txt + +echo ' SIL' >> $dir/lexicon.txt + +echo SIL > $dir/silence_phones.txt + +echo SIL >$dir/optional_silence.txt + +echo -n "" >$dir/extra_questions.txt diff --git a/egs/iam/v2/local/prepend_words.py b/egs/iam/v2/local/prepend_words.py new file mode 100755 index 00000000000..d53eb8974bf --- /dev/null +++ b/egs/iam/v2/local/prepend_words.py @@ -0,0 +1,13 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +# This script, prepend '|' to every words in the transcript to mark +# the beginning of the words for finding the initial-space of every word +# after decoding. 
+ +import sys, io + +infile = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8') +output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') +for line in infile: + output.write(' '.join(["|" + word for word in line.split()]) + '\n') diff --git a/egs/iam/v2/local/process_data.py b/egs/iam/v2/local/process_data.py new file mode 100755 index 00000000000..fa5eb484707 --- /dev/null +++ b/egs/iam/v2/local/process_data.py @@ -0,0 +1,82 @@ +#!/usr/bin/env python3 + +# Copyright 2017 Chun Chieh Chang +# 2017 Ashish Arora + +""" This script reads the extracted IAM database files and creates + the following files (for the data subset selected via --dataset): + text, utt2spk, images.scp. + + Eg. local/process_data.py data/local data/train data --dataset train + Eg. text file: 000_a01-000u-00 A MOVE to stop Mr. Gaitskell from + utt2spk file: 000_a01-000u-00 000 + images.scp file: 000_a01-000u-00 data/local/lines/a01/a01-000u/a01-000u-00.png +""" + +import argparse +import os +import sys +import xml.dom.minidom as minidom + +parser = argparse.ArgumentParser(description="""Creates text, utt2spk + and images.scp files.""") +parser.add_argument('database_path', type=str, + help='Path to the downloaded (and extracted) IAM data') +parser.add_argument('out_dir', type=str, + help='Where to write output files.') +parser.add_argument('--dataset', type=str, default='train', + choices=['train', 'test','validation'], + help='Subset of data to process.') +args = parser.parse_args() + +text_file = os.path.join(args.out_dir + '/', 'text') +text_fh = open(text_file, 'w') + +utt2spk_file = os.path.join(args.out_dir + '/', 'utt2spk') +utt2spk_fh = open(utt2spk_file, 'w') + +image_file = os.path.join(args.out_dir + '/', 'images.scp') +image_fh = open(image_file, 'w') + +dataset_path = os.path.join(args.database_path, + args.dataset + '.uttlist') + +text_file_path = os.path.join(args.database_path, + 'ascii','lines.txt') +text_dict = {} +def process_text_file_for_word_model(): + with open 
(text_file_path, 'rt') as in_file: + for line in in_file: + if line[0]=='#': + continue + line = line.strip() + utt_id = line.split(' ')[0] + text_vect = line.split(' ')[8:] + text = "".join(text_vect) + text = text.replace("|", " ") + text_dict[utt_id] = text + +print("Processing '{}' data...".format(args.dataset)) +process_text_file_for_word_model() + +with open(dataset_path) as f: + for line in f: + line = line.strip() + line_vect = line.split('-') + xml_file = line_vect[0] + '-' + line_vect[1] + xml_path = os.path.join(args.database_path, 'xml', xml_file + '.xml') + img_num = line[-3:] + doc = minidom.parse(xml_path) + + form_elements = doc.getElementsByTagName('form')[0] + writer_id = form_elements.getAttribute('writer-id') + outerfolder = form_elements.getAttribute('id')[0:3] + innerfolder = form_elements.getAttribute('id') + lines_path = os.path.join(args.database_path, 'lines', + outerfolder, innerfolder, innerfolder) + image_file_path = lines_path + img_num + '.png' + text = text_dict[line] + utt_id = writer_id + '_' + line + text_fh.write(utt_id + ' ' + text + '\n') + utt2spk_fh.write(utt_id + ' ' + writer_id + '\n') + image_fh.write(utt_id + ' ' + image_file_path + '\n') diff --git a/egs/iam/v2/local/remove_test_utterances_from_lob.py b/egs/iam/v2/local/remove_test_utterances_from_lob.py new file mode 100755 index 00000000000..1b414ef47f6 --- /dev/null +++ b/egs/iam/v2/local/remove_test_utterances_from_lob.py @@ -0,0 +1,117 @@ +#!/usr/bin/env python3 +# Copyright 2018 Ashish Arora + +import argparse +import os +import numpy as np +import sys +import re + +parser = argparse.ArgumentParser(description="""Removes dev/test set lines + from the LOB corpus. 
Reads the + corpus from stdin, and writes it to stdout.""") +parser.add_argument('dev_text', type=str, + help='dev transcription location.') +parser.add_argument('test_text', type=str, + help='test transcription location.') +args = parser.parse_args() + +def remove_punctuations(transcript): + char_list = [] + for char in transcript: + if char.isdigit() or char == '+' or char == '~' or char == '?': + continue + if char == '#' or char == '=' or char == '-' or char == '!': + continue + if char == ',' or char == '.' or char == ')' or char == '\'': + continue + if char == '(' or char == ':' or char == ';' or char == '"': + continue + char_list.append(char) + return char_list + + +def remove_special_words(words): + word_list = [] + for word in words: + if word == '' or word == '#': + continue + word_list.append(word) + return word_list + + +# process and add dev/eval transcript in a list +# remove special words, punctuations, spaces between words +# lowercase the characters +def read_utterances(text_file_path): + with open(text_file_path, 'rt') as in_file: + for line in in_file: + words = line.strip().split() + words_wo_sw = remove_special_words(words) + transcript = ''.join(words_wo_sw[1:]) + transcript = transcript.lower() + trans_wo_punct = remove_punctuations(transcript) + transcript = ''.join(trans_wo_punct) + utterance_dict[words_wo_sw[0]] = transcript + + +### main ### + +# read utterances and add it to utterance_dict +utterance_dict = dict() +read_utterances(args.dev_text) +read_utterances(args.test_text) + +# read corpus and add it to below lists +corpus_text_lowercase_wo_sc = list() +corpus_text_wo_sc = list() +original_corpus_text = list() +for line in sys.stdin: + original_corpus_text.append(line) + words = line.strip().split() + words_wo_sw = remove_special_words(words) + + transcript = ''.join(words_wo_sw) + transcript = transcript.lower() + trans_wo_punct = remove_punctuations(transcript) + transcript = ''.join(trans_wo_punct) + 
corpus_text_lowercase_wo_sc.append(transcript) + + transcript = ''.join(words_wo_sw) + trans_wo_punct = remove_punctuations(transcript) + transcript = ''.join(trans_wo_punct) + corpus_text_wo_sc.append(transcript) + +# find majority of utterances below +# for utterances which were not found +# add them to remaining_utterances +row_to_keep = [True for i in range(len(original_corpus_text))] +remaining_utterances = dict() +for line_id, line_to_find in utterance_dict.items(): + found_line = False + for i in range(1, (len(corpus_text_lowercase_wo_sc) - 2)): + # Combine 3 consecutive lines of the corpus into a single line + prev_words = corpus_text_lowercase_wo_sc[i - 1].strip() + curr_words = corpus_text_lowercase_wo_sc[i].strip() + next_words = corpus_text_lowercase_wo_sc[i + 1].strip() + new_line = prev_words + curr_words + next_words + transcript = ''.join(new_line) + if line_to_find in transcript: + found_line = True + row_to_keep[i-1] = False + row_to_keep[i] = False + row_to_keep[i+1] = False + if not found_line: + remaining_utterances[line_id] = line_to_find + + +for i in range(len(original_corpus_text)): + transcript = original_corpus_text[i].strip() + if row_to_keep[i]: + print(transcript) + +print('Sentences not removed from LOB: {}'.format(remaining_utterances), file=sys.stderr) +print('Total test+dev sentences: {}'.format(len(utterance_dict)), file=sys.stderr) +print('Number of sentences not removed from LOB: {}'. 
format(len(remaining_utterances)), file=sys.stderr) +print('LOB lines: Before: {} After: {}'.format(len(original_corpus_text), + row_to_keep.count(True)), file=sys.stderr) diff --git a/egs/iam/v2/local/remove_wellington_annotations.py b/egs/iam/v2/local/remove_wellington_annotations.py new file mode 100755 index 00000000000..260a3542985 --- /dev/null +++ b/egs/iam/v2/local/remove_wellington_annotations.py @@ -0,0 +1,32 @@ +#!/usr/bin/env python3 +# Copyright 2018 Chun-Chieh Chang + +import sys +import io +import re +from collections import OrderedDict + +sys.stdin = io.TextIOWrapper(sys.stdin.buffer, encoding="utf8"); +sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf8"); + +prev2_line = " "; +prev_line = " "; +for line in sys.stdin: + line = line.strip() + pattern = re.compile("\\*\\*\\[.*?\\*\\*\\]|\\*[0-9]|\\\\[0-9]{0,2}|\\*\\*?[\|,\?,\#,\=,\;,\:,\<,\>]|\||\^") + line_fixed = pattern.sub("", line) + dict=OrderedDict([("*+$","$"), ("*+","£"), ("*-","-"), ("*/","*"), ("*{","{"), ("*}","}"), + ("**\"","\""), ("*\"","\""), ("**'","'"), ("*'","'"), ("*@","°")]) + pattern = re.compile("|".join(re.escape(key) for key in dict.keys())); + line_fixed = pattern.sub(lambda x: dict[x.group()], line_fixed) + + line_fixed = prev2_line + "\n" + prev_line + "\n" + line_fixed + + pattern = re.compile("\{[0-9]{0,2}(.*?)\}", re.DOTALL) + line_fixed = pattern.sub(lambda x: x.group(1), line_fixed) + + output, prev2_line, prev_line = line_fixed.split("\n") + + sys.stdout.write(output + "\n") +sys.stdout.write(prev2_line + "\n") +sys.stdout.write(prev_line + "\n") diff --git a/egs/iam/v2/local/score.sh b/egs/iam/v2/local/score.sh new file mode 100755 index 00000000000..b2032909333 --- /dev/null +++ b/egs/iam/v2/local/score.sh @@ -0,0 +1,155 @@ +#!/bin/bash +# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey, Yenda Trmal) +# Apache 2.0 + +# This script is like steps/scoring/score_kaldi_wer.sh except it transcribes the 's +# using 
local/unk_arc_post_to_transcription.py and also it calls +# steps/scoring/score_kaldi_cer.sh at the end. + +[ -f ./path.sh ] && . ./path.sh + +# begin configuration section. +cmd=run.pl +stage=0 +decode_mbr=false +stats=true +beam=6 +word_ins_penalty=0.0,0.5,1.0 +min_lmwt=3 +max_lmwt=13 +iter=final +#end configuration section. + +echo "$0 $@" # Print the command line for logging +[ -f ./path.sh ] && . ./path.sh +. parse_options.sh || exit 1; + +if [ $# -ne 3 ]; then + echo "Usage: $0 [--cmd (run.pl|queue.pl...)] " + echo " Options:" + echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." + echo " --stage (0|1|2) # start scoring script from part-way through." + echo " --decode_mbr (true/false) # maximum bayes risk decoding (confusion network)." + echo " --min_lmwt # minumum LM-weight for lattice rescoring " + echo " --max_lmwt # maximum LM-weight for lattice rescoring " + exit 1; +fi + +data=$1 +lang_or_graph=$2 +dir=$3 +model_path=`echo $dir |xargs dirname` +symtab=$lang_or_graph/words.txt + +for f in $symtab $dir/lat.1.gz $data/text; do + [ ! 
-f $f ] && echo "score.sh: no such file $f" && exit 1; +done + + +ref_filtering_cmd="cat" +[ -x local/wer_output_filter ] && ref_filtering_cmd="local/wer_output_filter" +[ -x local/wer_ref_filter ] && ref_filtering_cmd="local/wer_ref_filter" +hyp_filtering_cmd="cat" +[ -x local/wer_output_filter ] && hyp_filtering_cmd="local/wer_output_filter" +[ -x local/wer_hyp_filter ] && hyp_filtering_cmd="local/wer_hyp_filter" + + +if $decode_mbr ; then + echo "$0: scoring with MBR, word insertion penalty=$word_ins_penalty" +else + echo "$0: scoring with word insertion penalty=$word_ins_penalty" +fi + + +mkdir -p $dir/scoring_kaldi +cat $data/text | $ref_filtering_cmd > $dir/scoring_kaldi/test_filt.txt || exit 1; +if [ $stage -le 0 ]; then + + for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do + mkdir -p $dir/scoring_kaldi/penalty_$wip/log + + if $decode_mbr ; then + $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring_kaldi/penalty_$wip/log/best_path.LMWT.log \ + acwt=\`perl -e \"print 1.0/LMWT\"\`\; \ + lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ + lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \ + lattice-prune --beam=$beam ark:- ark:- \| \ + lattice-mbr-decode --word-symbol-table=$symtab \ + ark:- ark,t:- \| \ + utils/int2sym.pl -f 2- $symtab \| \ + $hyp_filtering_cmd '>' $dir/scoring_kaldi/penalty_$wip/LMWT.txt || exit 1; + + else + $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring_kaldi/penalty_$wip/log/best_path.LMWT.log \ + lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ + lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \ + lattice-best-path --word-symbol-table=$symtab ark:- ark,t:- \| \ + utils/int2sym.pl -f 2- $symtab \| \ + $hyp_filtering_cmd '>' $dir/scoring_kaldi/penalty_$wip/LMWT.txt || exit 1; + fi + + $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring_kaldi/penalty_$wip/log/score.LMWT.log \ + cat $dir/scoring_kaldi/penalty_$wip/LMWT.txt \| \ + compute-wer --text --mode=present \ + 
"ark:cat $dir/scoring_kaldi/test_filt.txt |" ark,p:- ">&" $dir/wer_LMWT_$wip || exit 1; + + done +fi + + + +if [ $stage -le 1 ]; then + + for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do + for lmwt in $(seq $min_lmwt $max_lmwt); do + # adding /dev/null to the command list below forces grep to output the filename + grep WER $dir/wer_${lmwt}_${wip} /dev/null + done + done | utils/best_wer.sh >& $dir/scoring_kaldi/best_wer || exit 1 + + best_wer_file=$(awk '{print $NF}' $dir/scoring_kaldi/best_wer) + best_wip=$(echo $best_wer_file | awk -F_ '{print $NF}') + best_lmwt=$(echo $best_wer_file | awk -F_ '{N=NF-1; print $N}') + + if [ -z "$best_lmwt" ]; then + echo "$0: we could not get the details of the best WER from the file $dir/wer_*. Probably something went wrong." + exit 1; + fi + + if $stats; then + mkdir -p $dir/scoring_kaldi/wer_details + echo $best_lmwt > $dir/scoring_kaldi/wer_details/lmwt # record best language model weight + echo $best_wip > $dir/scoring_kaldi/wer_details/wip # record best word insertion penalty + + $cmd $dir/scoring_kaldi/log/stats1.log \ + cat $dir/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt \| \ + align-text --special-symbol="'***'" ark:$dir/scoring_kaldi/test_filt.txt ark:- ark,t:- \| \ + utils/scoring/wer_per_utt_details.pl --special-symbol "'***'" \| tee $dir/scoring_kaldi/wer_details/per_utt \|\ + utils/scoring/wer_per_spk_details.pl $data/utt2spk \> $dir/scoring_kaldi/wer_details/per_spk || exit 1; + + $cmd $dir/scoring_kaldi/log/stats2.log \ + cat $dir/scoring_kaldi/wer_details/per_utt \| \ + utils/scoring/wer_ops_details.pl --special-symbol "'***'" \| \ + sort -b -i -k 1,1 -k 4,4rn -k 2,2 -k 3,3 \> $dir/scoring_kaldi/wer_details/ops || exit 1; + + $cmd $dir/scoring_kaldi/log/wer_bootci.log \ + compute-wer-bootci --mode=present \ + ark:$dir/scoring_kaldi/test_filt.txt ark:$dir/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt \ + '>' $dir/scoring_kaldi/wer_details/wer_bootci || exit 1; + + fi +fi + 
+steps/scoring/score_kaldi_cer.sh --cmd "$cmd" --stage 2 --min-lmwt $min_lmwt \ + --max-lmwt $max_lmwt --word-ins-penalty $word_ins_penalty \ + $data $lang_or_graph $dir + +# If we got here, the scoring was successful. +# As a small aid to prevent confusion, we remove all wer_{?,??} files; +# these originate from the previous version of the scoring files +# I keep both statements here because it could otherwise lead to confusion about +# the capabilities of the script (we don't do cer in the script) +rm $dir/wer_{?,??} 2>/dev/null +rm $dir/cer_{?,??} 2>/dev/null + +exit 0; diff --git a/egs/iam/v2/local/srilm_train.sh b/egs/iam/v2/local/srilm_train.sh new file mode 100755 index 00000000000..50cae4af37c --- /dev/null +++ b/egs/iam/v2/local/srilm_train.sh @@ -0,0 +1,49 @@ +#!/bin/bash + +order=8 + +. ./path.sh +. ./utils/parse_options.sh + +if [ $# -lt 1 ]; then + echo "Usage: $0 "; + exit 1; +fi + +dir=$1 + +loc=`which ngram-count`; +if [ -z $loc ]; then + if uname -a | grep 64 >/dev/null; then # some kind of 64 bit... + sdir=`pwd`/../../../tools/srilm/bin/i686-m64 + else + sdir=`pwd`/../../../tools/srilm/bin/i686 + fi + if [ -f $sdir/ngram-count ]; then + echo Using SRILM tools from $sdir + export PATH=$PATH:$sdir + else + echo You appear to not have SRILM tools installed, either on your path, + echo or installed in $sdir. See tools/install_srilm.sh for installation + echo instructions. 
+ exit 1 + fi +fi + + +set -o errexit +mkdir -p $dir +export LC_ALL=C + +src=data/local/local_lm/data/text +wordlist=data/local/local_lm/data/wordlist + +cat $src/{iam,lob,brown}.txt | gzip -c > $dir/train.gz +cp $src/dev.txt $dir/heldout + +ngram-count -text $dir/train.gz -order $order \ + -kndiscount -interpolate -lm $dir/sw1.o${order}g.kn.gz + +echo "PPL for ${order}-gram LM:" +ngram -order $order -lm $dir/sw1.o${order}g.kn.gz -ppl $dir/heldout +#ngram -lm $dir/sw1.o${order}g.kn.gz -ppl $dir/heldout -debug 2 >& $dir/3gram.ppl2 diff --git a/egs/iam/v2/local/train_lm.sh b/egs/iam/v2/local/train_lm.sh new file mode 100755 index 00000000000..e8353635ae8 --- /dev/null +++ b/egs/iam/v2/local/train_lm.sh @@ -0,0 +1,151 @@ +#!/bin/bash + +# Copyright 2016 Vincent Nguyen +# 2016 Johns Hopkins University (author: Daniel Povey) +# 2017 Ashish Arora +# 2017 Hossein Hadian +# Apache 2.0 +# +# This script trains an LM on the LOB+Brown text data and IAM training transcriptions. +# It is based on the example scripts distributed with PocoLM + +# It will check if pocolm is installed and if not will proceed with installation + +set -e +stage=0 +vocab_size=50000 + +echo "$0 $@" # Print the command line for logging +. ./utils/parse_options.sh || exit 1; + +dir=data/local/local_lm +lm_dir=${dir}/data + + +mkdir -p $dir +. ./path.sh || exit 1; # for KALDI_ROOT +export PATH=$KALDI_ROOT/tools/pocolm/scripts:$PATH +( # First make sure the pocolm toolkit is installed. + cd $KALDI_ROOT/tools || exit 1; + if [ -d pocolm ]; then + echo Not installing the pocolm toolkit since it is already there. + else + echo "$0: Please install the PocoLM toolkit with: " + echo " cd ../../../tools; extras/install_pocolm.sh; cd -" + exit 1; + fi +) || exit 1; + +bypass_metaparam_optim_opt= +# If you want to bypass the metaparameter optimization steps with specific metaparameters +# un-comment the following line, and change the numbers to some appropriate values. 
+# You can find the values from output log of train_lm.py. +# These example metaparameter values are for a 4-gram model (with min-counts) +# running with train_lm.py. +# The dev perplexity should be close to the non-bypassed model. +#bypass_metaparam_optim_opt= +# Note: to use these example parameters, you may need to remove the .done files +# to make sure make_lm_dir.py is called, and train only a 3-gram model +#for order in 3; do +#rm -f ${lm_dir}/${num_word}_${order}.pocolm/.done + +if [ $stage -le 0 ]; then + mkdir -p ${dir}/data + mkdir -p ${dir}/data/text + + echo "$0: Getting the Data sources" + + rm ${dir}/data/text/* 2>/dev/null || true + + # Using LOB and brown corpus. + cat data/local/lob.txt | \ + local/prepend_words.py | utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \ + | sed 's/@@//g' > ${dir}/data/text/lob.txt + cat data/local/browncorpus/brown.txt | \ + local/prepend_words.py | utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \ + | sed 's/@@//g' > ${dir}/data/text/brown.txt + if [ -d "data/local/wellingtoncorpus" ]; then + cat data/local/wellingtoncorpus/Wellington_annotation_removed.txt | \ + local/prepend_words.py | utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \ + | sed 's/@@//g' > ${dir}/data/text/wellington.txt + fi + + # use the validation data as the dev set. + # Note: the name 'dev' is treated specially by pocolm, it automatically + # becomes the dev set. + + cat data/val/text | cut -d " " -f 2- > ${dir}/data/text/dev.txt + + # use the training data as an additional data source. + # we can later fold the dev data into this. + cat data/train/text | cut -d " " -f 2- > ${dir}/data/text/iam.txt + + # for reporting perplexities, we'll use the "real" dev set. + # (the validation data is used as ${dir}/data/text/dev.txt to work + # out interpolation weights.) + # note, we can't put it in ${dir}/data/text/, because then pocolm would use + # it as one of the data sources. 
+ cut -d " " -f 2- < data/test/text > ${dir}/data/real_dev_set.txt + + # get the wordlist from IAM text + if [ -d "data/local/wellingtoncorpus" ]; then + cat ${dir}/data/text/{iam,lob,brown,wellington}.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count + else + echo "$0: Wellington Corpus not found. Proceeding without using that corpus." + cat ${dir}/data/text/{iam,lob,brown}.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count + fi + head -n $vocab_size ${dir}/data/word_count | awk '{print $2}' > ${dir}/data/wordlist +fi + +order=6 + +if [ $stage -le 1 ]; then + # decide on the vocabulary. + # Note: you'd use --wordlist if you had a previously determined word-list + # that you wanted to use. + # Note: if you have more than one order, use a certain amount of words as the + # vocab and want to restrict max memory for 'sort', + echo "$0: training the unpruned LM" + min_counts='brown=2 lob=2 iam=1' + wordlist=${dir}/data/wordlist + + lm_name="`basename ${wordlist}`_${order}" + if [ -n "${min_counts}" ]; then + lm_name+="_`echo ${min_counts} | tr -s "[:blank:]" "_" | tr "=" "-"`" + fi + unpruned_lm_dir=${lm_dir}/${lm_name}.pocolm + + train_lm.py --wordlist=${wordlist} --num-splits=10 --warm-start-ratio=20 \ + --limit-unk-history=true \ + ${bypass_metaparam_optim_opt} \ + ${dir}/data/text ${order} ${lm_dir}/work ${unpruned_lm_dir} + + mkdir -p ${dir}/data/arpa + format_arpa_lm.py ${unpruned_lm_dir} | gzip -c > ${dir}/data/arpa/${order}gram_unpruned.arpa.gz + + get_data_prob.py ${dir}/data/real_dev_set.txt ${unpruned_lm_dir} 2>&1 | grep -F '[perplexity' +fi + +if [ $stage -le 2 ]; then + echo "$0: pruning the LM (to larger size)" + # Using 1 million n-grams for a big LM for rescoring purposes. 
+ size=1000000 + prune_lm_dir.py --target-num-ngrams=$size --initial-threshold=0.02 ${unpruned_lm_dir} ${dir}/data/lm_${order}_prune_big + + get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_big 2>&1 | grep -F '[perplexity' + + mkdir -p ${dir}/data/arpa + format_arpa_lm.py ${dir}/data/lm_${order}_prune_big | gzip -c > ${dir}/data/arpa/${order}gram_big.arpa.gz +fi + +if [ $stage -le 3 ]; then + echo "$0: pruning the LM (to smaller size)" + # Using 500,000 n-grams for a smaller LM for graph building. Prune from the + # bigger-pruned LM, it'll be faster. + size=500000 + prune_lm_dir.py --target-num-ngrams=$size ${dir}/data/lm_${order}_prune_big ${dir}/data/lm_${order}_prune_small + + get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_small 2>&1 | grep -F '[perplexity' + + format_arpa_lm.py ${dir}/data/lm_${order}_prune_small | gzip -c > ${dir}/data/arpa/${order}gram_small.arpa.gz +fi diff --git a/egs/iam/v2/local/wer_output_filter b/egs/iam/v2/local/wer_output_filter new file mode 100755 index 00000000000..24691a160a9 --- /dev/null +++ b/egs/iam/v2/local/wer_output_filter @@ -0,0 +1,31 @@ +#!/usr/bin/env python3 +# Copyright 2017 Hossein Hadian + +# This is a filter used in scoring. It separates all +# punctuation marks from words. For example, this sentence: + +# "They have come!" he said reverently, gripping his +# hands. "Isn't it a glorious thing! Long awaited." + +# is converted to this: + +# " They have come ! " he said reverently , gripping his +# hands . " Isn ' t it a glorious thing ! Long awaited . 
" + +# Sample BPE-based output: +# |He |ro se |from |his |b re ak f as t - s ch oo l |b en ch + +import sys +import re + +punctuations = "!(),.?;:'-\"" +escaped_punctuations = re.escape(punctuations) + +for line in sys.stdin: + words = line.strip().split() + uttid = words[0] + transcript = ''.join(words[1:]) + transcript = transcript.replace('|', ' ') + split_transcript = " ".join(re.split("([{}])".format(escaped_punctuations), + transcript)).strip() + print("{} {}".format(uttid, split_transcript)) diff --git a/egs/iam/v2/path.sh b/egs/iam/v2/path.sh new file mode 100755 index 00000000000..7e458144624 --- /dev/null +++ b/egs/iam/v2/path.sh @@ -0,0 +1,9 @@ +export KALDI_ROOT=`pwd`/../../.. +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh + +export LD_LIBRARY_PATH=$KALDI_ROOT/tools/openfst/lib:$LD_LIBRARY_PATH +export LD_LIBRARY_PATH=/home/dpovey/libs:$LD_LIBRARY_PATH +export LC_ALL=C diff --git a/egs/iam/v2/run_end2end.sh b/egs/iam/v2/run_end2end.sh new file mode 100755 index 00000000000..53c1481ecaf --- /dev/null +++ b/egs/iam/v2/run_end2end.sh @@ -0,0 +1,104 @@ +#!/bin/bash +# Copyright 2017 Hossein Hadian + +set -e +stage=0 +nj=20 +username= +password= +# iam_database points to the database path on the JHU grid. If you have not +# already downloaded the database you can set it to a local directory +# like "data/download" and follow the instructions +# in "local/prepare_data.sh" to download the database: +iam_database=/export/corpora5/handwriting_ocr/IAM +# wellington_database points to the database path on the JHU grid. The Wellington +# corpus contains two directories WWC and WSC (Wellington Written and Spoken Corpus). 
+# This corpus is of written NZ English that can be purchased here: +# "https://www.victoria.ac.nz/lals/resources/corpora-default" +wellington_database=/export/corpora5/Wellington/WWC/ + +. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. +. ./path.sh +. ./utils/parse_options.sh # e.g. this parses the above options + # if supplied. + + +./local/check_tools.sh + +if [ $stage -le 0 ]; then + echo "$0: Preparing data..." + local/prepare_data.sh --download-dir "$iam_database" \ + --wellington-dir "$wellington_database" \ + --username "$username" --password "$password" +fi +mkdir -p data/{train,test}/data + +if [ $stage -le 1 ]; then + image/get_image2num_frames.py data/train # This will be needed for the next command + # The next command creates a "allowed_lengths.txt" file in data/train + # which will be used by local/make_features.py to enforce the images to + # have allowed lengths. The allowed lengths will be spaced by 10% difference in length. + image/get_allowed_lengths.py --frame-subsampling-factor 4 10 data/train + echo "$0: Preparing the test and train feature files..." + for dataset in train test; do + local/make_features.py data/$dataset --feat-dim 40 | \ + copy-feats --compress=true --compression-method=7 \ + ark:- ark,scp:data/$dataset/data/images.ark,data/$dataset/feats.scp + steps/compute_cmvn_stats.sh data/$dataset + done + utils/fix_data_dir.sh data/train +fi + +if [ $stage -le 2 ]; then + echo "$0: Preparing BPE..." 
+ cut -d' ' -f2- data/train/text | \ + local/prepend_words.py | \ + utils/lang/bpe/learn_bpe.py -s 700 > data/local/bpe.txt + for set in test train val; do + cut -d' ' -f1 data/$set/text > data/$set/ids + cut -d' ' -f2- data/$set/text | \ + local/prepend_words.py | utils/lang/bpe/apply_bpe.py -c data/train/bpe.txt \ + | sed 's/@@//g' > data/$set/bpe_text + mv data/$set/text data/$set/text.old + paste -d' ' data/$set/ids data/$set/bpe_text > data/$set/text + done +fi + +if [ $stage -le 3 ]; then + echo "$0: Estimating a language model for decoding..." + local/train_lm.sh +fi + +if [ $stage -le 4 ]; then + echo "$0: Preparing dictionary and lang..." + local/prepare_dict.sh + # This recipe uses byte-pair encoding, the silences are part of the words' pronunciations. + # So we set --sil-prob to 0.0 + utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.0 --position-dependent-phones false \ + data/local/dict "" data/lang/temp data/lang + utils/lang/bpe/add_final_optional_silence.sh --final-sil-prob 0.5 data/lang + + utils/format_lm.sh data/lang data/local/local_lm/data/arpa/6gram_big.arpa.gz \ + data/local/dict/lexicon.txt data/lang + utils/build_const_arpa_lm.sh data/local/local_lm/data/arpa/6gram_unpruned.arpa.gz \ + data/lang data/lang_rescore_6g +fi + +if [ $stage -le 5 ]; then + echo "$0: Calling the flat-start chain recipe..." + local/chain/run_e2e_cnn.sh +fi + +if [ $stage -le 6 ]; then + echo "$0: Aligning the training data using the e2e chain model..." + steps/nnet3/align.sh --nj 50 --cmd "$cmd" \ + --use-gpu false \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0 --acoustic-scale=1.0' \ + data/train data/lang exp/chain/e2e_cnn_1a exp/chain/e2e_ali_train +fi + +if [ $stage -le 7 ]; then + echo "$0: Building a tree and training a regular chain model using the e2e alignments..." 
+ local/chain/run_cnn_e2eali.sh +fi diff --git a/egs/iam/v2/steps b/egs/iam/v2/steps new file mode 120000 index 00000000000..6e99bf5b5ad --- /dev/null +++ b/egs/iam/v2/steps @@ -0,0 +1 @@ +../../wsj/s5/steps \ No newline at end of file diff --git a/egs/iam/v2/utils b/egs/iam/v2/utils new file mode 120000 index 00000000000..b240885218f --- /dev/null +++ b/egs/iam/v2/utils @@ -0,0 +1 @@ +../../wsj/s5/utils \ No newline at end of file From 3e0f6ae6588f44f156fc0c0f5aa93727ef0f6db7 Mon Sep 17 00:00:00 2001 From: Hossein Hadian Date: Tue, 7 Aug 2018 14:04:45 -0400 Subject: [PATCH 2/4] small fix --- egs/iam/v2/local/train_lm.sh | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/egs/iam/v2/local/train_lm.sh b/egs/iam/v2/local/train_lm.sh index e8353635ae8..f7aa3472ca8 100755 --- a/egs/iam/v2/local/train_lm.sh +++ b/egs/iam/v2/local/train_lm.sh @@ -58,7 +58,12 @@ if [ $stage -le 0 ]; then rm ${dir}/data/text/* 2>/dev/null || true # Using LOB and brown corpus. - cat data/local/lob.txt | \ + if [ ! 
-f data/local/lob-train-only.txt ]; then + cat data/local/lobcorpus/0167/download/LOB_COCOA/lob.txt | \ + local/remove_test_utterances_from_lob.py data/test/text data/val/text \ + > data/local/lob-train-only.txt + fi + cat data/local/lob-train-only.txt | \ local/prepend_words.py | utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \ | sed 's/@@//g' > ${dir}/data/text/lob.txt cat data/local/browncorpus/brown.txt | \ From c0a3c6db08f3aa6bbfbe5d6cb57c966a36a14301 Mon Sep 17 00:00:00 2001 From: Hossein Hadian Date: Tue, 7 Aug 2018 14:33:41 -0400 Subject: [PATCH 3/4] Remove mistakenly-added file --- egs/iam/v2/local/srilm_train.sh | 49 --------------------------------- 1 file changed, 49 deletions(-) delete mode 100755 egs/iam/v2/local/srilm_train.sh diff --git a/egs/iam/v2/local/srilm_train.sh b/egs/iam/v2/local/srilm_train.sh deleted file mode 100755 index 50cae4af37c..00000000000 --- a/egs/iam/v2/local/srilm_train.sh +++ /dev/null @@ -1,49 +0,0 @@ -#!/bin/bash - -order=8 - -. ./path.sh -. ./utils/parse_options.sh - -if [ $# -lt 1 ]; then - echo "Usage: $0 "; - exit 1; -fi - -dir=$1 - -loc=`which ngram-count`; -if [ -z $loc ]; then - if uname -a | grep 64 >/dev/null; then # some kind of 64 bit... - sdir=`pwd`/../../../tools/srilm/bin/i686-m64 - else - sdir=`pwd`/../../../tools/srilm/bin/i686 - fi - if [ -f $sdir/ngram-count ]; then - echo Using SRILM tools from $sdir - export PATH=$PATH:$sdir - else - echo You appear to not have SRILM tools installed, either on your path, - echo or installed in $sdir. See tools/install_srilm.sh for installation - echo instructions. 
- exit 1 - fi -fi - - -set -o errexit -mkdir -p $dir -export LC_ALL=C - -src=data/local/local_lm/data/text -wordlist=data/local/local_lm/data/wordlist - -cat $src/{iam,lob,brown}.txt | gzip -c > $dir/train.gz -cp $src/dev.txt $dir/heldout - -ngram-count -text $dir/train.gz -order $order \ - -kndiscount -interpolate -lm $dir/sw1.o${order}g.kn.gz - -echo "PPL for ${order}-gram LM:" -ngram -order $order -lm $dir/sw1.o${order}g.kn.gz -ppl $dir/heldout -#ngram -lm $dir/sw1.o${order}g.kn.gz -ppl $dir/heldout -debug 2 >& $dir/3gram.ppl2 From 466958e5b13f5f9e8a05b1de58e5f88dc32b67aa Mon Sep 17 00:00:00 2001 From: Hossein Hadian Date: Wed, 8 Aug 2018 12:27:46 -0400 Subject: [PATCH 4/4] Minor fixes --- egs/iam/v2/local/chain/run_e2e_cnn.sh | 8 ++++++-- egs/iam/v2/local/train_lm.sh | 2 +- egs/iam/v2/run_end2end.sh | 2 +- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/egs/iam/v2/local/chain/run_e2e_cnn.sh b/egs/iam/v2/local/chain/run_e2e_cnn.sh index 0a49e980be4..15bdf610cd3 100755 --- a/egs/iam/v2/local/chain/run_e2e_cnn.sh +++ b/egs/iam/v2/local/chain/run_e2e_cnn.sh @@ -35,7 +35,8 @@ l2_regularize=0.00005 frames_per_iter=1000000 cmvn_opts="--norm-means=true --norm-vars=true" train_set=train -lang_test=lang_unk +lang_decode=data/lang +lang_rescore=data/lang_rescore_6g # End configuration section. echo "$0 $@" # Print the command line for logging @@ -155,7 +156,7 @@ if [ $stage -le 4 ]; then # as long as phones.txt was compatible. utils/mkgraph.sh \ - --self-loop-scale 1.0 data/$lang_test \ + --self-loop-scale 1.0 $lang_decode \ $dir $dir/graph || exit 1; fi @@ -164,6 +165,9 @@ if [ $stage -le 5 ]; then steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ --nj 30 --cmd "$cmd" \ $dir/graph data/test $dir/decode_test || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/test $dir/decode_test{,_rescored} || exit 1 fi echo "Done. Date: $(date). 
Results:" diff --git a/egs/iam/v2/local/train_lm.sh b/egs/iam/v2/local/train_lm.sh index f7aa3472ca8..35eb56b1341 100755 --- a/egs/iam/v2/local/train_lm.sh +++ b/egs/iam/v2/local/train_lm.sh @@ -60,7 +60,7 @@ if [ $stage -le 0 ]; then # Using LOB and brown corpus. if [ ! -f data/local/lob-train-only.txt ]; then cat data/local/lobcorpus/0167/download/LOB_COCOA/lob.txt | \ - local/remove_test_utterances_from_lob.py data/test/text data/val/text \ + local/remove_test_utterances_from_lob.py data/test/text.old data/val/text.old \ > data/local/lob-train-only.txt fi cat data/local/lob-train-only.txt | \ diff --git a/egs/iam/v2/run_end2end.sh b/egs/iam/v2/run_end2end.sh index 53c1481ecaf..de5c7086ec2 100755 --- a/egs/iam/v2/run_end2end.sh +++ b/egs/iam/v2/run_end2end.sh @@ -58,7 +58,7 @@ if [ $stage -le 2 ]; then for set in test train val; do cut -d' ' -f1 data/$set/text > data/$set/ids cut -d' ' -f2- data/$set/text | \ - local/prepend_words.py | utils/lang/bpe/apply_bpe.py -c data/train/bpe.txt \ + local/prepend_words.py | utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \ | sed 's/@@//g' > data/$set/bpe_text mv data/$set/text data/$set/text.old paste -d' ' data/$set/ids data/$set/bpe_text > data/$set/text