diff --git a/egs/gale_arabic/s5b/RESULTS b/egs/gale_arabic/s5b/RESULTS index 2260a106654..e0fb9d38ceb 100644 --- a/egs/gale_arabic/s5b/RESULTS +++ b/egs/gale_arabic/s5b/RESULTS @@ -65,8 +65,30 @@ Combined Results for Reports and Conversational WER: %WER 32.36 [ 22542 / 69668, 2156 ins, 4184 del, 16202 sub ] exp/tri2b_mmi/decode_it4/wer_11 %WER 32.50 [ 22640 / 69668, 2393 ins, 3956 del, 16291 sub ] exp/tri2b_mmi/decode_it3/wer_11 %WER 32.79 [ 22847 / 69668, 2407 ins, 4760 del, 15680 sub ] exp/tri2b_mpe/decode_it3/wer_13 +# WER with train_sat_basis +%WER 33.35 [ 23233 / 69668, 2385 ins, 5274 del, 15574 sub ] exp/tri3b/decode/wer_16_0.5 +# WER with train_sat %WER 33.61 [ 23413 / 69668, 2817 ins, 4577 del, 16019 sub ] exp/tri3b/decode/wer_17 %WER 35.73 [ 24894 / 69668, 2630 ins, 4944 del, 17320 sub ] exp/tri3b/decode.si/wer_15 %WER 36.17 [ 25196 / 69668, 2429 ins, 5393 del, 17374 sub ] exp/tri2b/decode/wer_16 %WER 39.42 [ 27462 / 69668, 2473 ins, 6051 del, 18938 sub ] exp/tri2a/decode/wer_14 %WER 40.35 [ 28113 / 69668, 2713 ins, 5635 del, 19765 sub ] exp/tri1/decode/wer_13 + + +# Effect of GMM seed model (tri2b instead of tri3b). Using tri3b give a slightly better result +# as compared to using tri2b as seed. +%WER 16.66 [ 11610 / 69668, 1233 ins, 2747 del, 7630 sub ] exp/chain/tdnn_1a_3b_sp/decode_test/wer_10_0.0 +%WER 16.71 [ 11642 / 69668, 1145 ins, 2908 del, 7589 sub ] exp/chain/tdnn_1a_2b_sp/decode_test/wer_9_0.0 + +# Effect of Tree-size (3500, 4500, 7000, 11000) +%WER 16.66 [ 11610 / 69668, 1233 ins, 2747 del, 7630 sub ] exp/chain/tdnn_1a_3500_sp/decode_test/wer_10_0.0 +%WER 16.59 [ 11557 / 69668, 1234 ins, 2646 del, 7677 sub ] exp/chain/tdnn_1a_4500_sp/decode_test/wer_10_0.0 +%WER 16.47 [ 11474 / 69668, 1421 ins, 2207 del, 7846 sub ] exp/chain/tdnn_1a_7000_sp/decode_test/wer_9_0.0 +%WER 16.62 [ 11580 / 69668, 1164 ins, 2789 del, 7627 sub ] exp/chain/tdnn_1a_11000_sp/decode_test/wer_10_0.0 + +# Effect of l2-regularization on the output with tree-size=7000. 
l2 on the output (0.005,0.002) +%WER 16.54 [ 11522 / 69668, 1123 ins, 2739 del, 7660 sub ] exp/chain/tdnn_1a_7000_005_sp/decode_test/wer_9_0.5 +%WER 16.47 [ 11474 / 69668, 1421 ins, 2207 del, 7846 sub ] exp/chain/tdnn_1a_7000_002_sp/decode_test/wer_9_0.0 + +#current best 'chain' models (see local/chain/tuning/run_tdnn_1a.sh) +%WER 16.47 [ 11474 / 69668, 1421 ins, 2207 del, 7846 sub ] exp/chain/tdnn_1a_sp/decode_test/wer_9_0.0 diff --git a/egs/gale_arabic/s5b/cmd.sh b/egs/gale_arabic/s5b/cmd.sh index 71dd849a93b..ea341c98d4a 100755 --- a/egs/gale_arabic/s5b/cmd.sh +++ b/egs/gale_arabic/s5b/cmd.sh @@ -10,6 +10,6 @@ # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. -export train_cmd="queue.pl --mem 2G" -export decode_cmd="queue.pl --mem 4G" -export mkgraph_cmd="queue.pl --mem 8G" +export train_cmd="retry.pl queue.pl --mem 2G" +export decode_cmd="retry.pl queue.pl --mem 4G" +export mkgraph_cmd="retry.pl queue.pl --mem 8G" diff --git a/egs/gale_arabic/s5b/local/chain/compare_wer.sh b/egs/gale_arabic/s5b/local/chain/compare_wer.sh new file mode 100755 index 00000000000..1a40523355a --- /dev/null +++ b/egs/gale_arabic/s5b/local/chain/compare_wer.sh @@ -0,0 +1,72 @@ +#!/bin/bash + +# this script is used for comparing decoding results between systems. +# e.g. local/chain/compare_wer.sh exp/chain/cnn{1a,1b} + +# ./local/chain/compare_wer.sh exp/chain/cnn1a +# System cnn1a +# WER 0.61 +# CER 0.15 +# Final train prob -0.0377 +# Final valid prob -0.0380 +# Final train prob (xent) -0.0830 +# Final valid prob (xent) -0.0838 + +if [ $# == 0 ]; then + echo "Usage: $0: [ ... 
]" + echo "e.g.: $0 exp/chain/cnn{1a,1b}" + exit 1 +fi + +echo "# $0 $*" +used_epochs=false + +echo -n "# System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +echo -n "# WER " +for x in $*; do + wer=$(cat $x/decode_test/scoring_kaldi/best_wer | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "# CER " +for x in $*; do + cer=$(cat $x/decode_test/scoring_kaldi/best_cer | awk '{print $2}') + printf "% 10s" $cer +done +echo + +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. +fi + +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final train prob (xent) " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob (xent) " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo diff --git a/egs/gale_arabic/s5b/local/chain/run_chain_common.sh b/egs/gale_arabic/s5b/local/chain/run_chain_common.sh new file mode 100755 index 00000000000..da37e148441 --- /dev/null +++ b/egs/gale_arabic/s5b/local/chain/run_chain_common.sh @@ -0,0 +1,82 @@ +#!/bin/bash + +# this script has common stages shared across librispeech chain recipes. +# It generates a new topology in a new lang directory, gets the alignments as +# lattices, and builds a tree for the new topology +set -e + +stage=11 + +# input directory names. 
These options are actually compulsory, and they have +# been named for convenience +gmm_dir= +ali_dir= +lores_train_data_dir= + +num_leaves=6000 + +# output directory names. They are also compulsory. +lang= +lat_dir= +tree_dir= +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +[ -z $lang ] && echo "Set --lang, this specifies the new lang directory which will have the new topology" && exit 1; +[ -z $lat_dir ] && echo "Set --lat-dir, this specifies the experiment directory to store lattice" && exit 1; +[ -z $tree_dir ] && echo "Set --tree-dir, this specifies the directory to store new tree " && exit 1; + +for f in $gmm_dir/final.mdl $ali_dir/ali.1.gz $lores_train_data_dir/feats.scp; do + [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1 +done + +if [ $stage -le 11 ]; then + echo "$0: creating lang directory with one state per phone." + # Create a version of the lang/ directory that has one state per phone in the + # topo file. [note, it really has two states.. the first one is only repeated + # once, the second one has zero or more repeats.] + if [ -d $lang ]; then + if [ $lang/L.fst -nt data/lang/L.fst ]; then + echo "$0: $lang already exists, not overwriting it; continuing" + else + echo "$0: $lang already exists and seems to be older than data/lang..." + echo " ... not sure what to do. Exiting." + exit 1; + fi + else + cp -r data/lang $lang + silphonelist=$(cat $lang/phones/silence.csl) || exit 1; + nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1; + # Use our special topology... note that later on may have to tune this + # topology. + steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo + fi +fi + +if [ $stage -le 12 ]; then + # Get the alignments as lattices (gives the chain training more freedom). 
+ # use the same num-jobs as the alignments + nj=$(cat ${ali_dir}/num_jobs) || exit 1; + steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" ${lores_train_data_dir} \ + $lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 13 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" $num_leaves ${lores_train_data_dir} $lang $ali_dir $tree_dir +fi + +exit 0; diff --git a/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1a.sh b/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1a.sh index 7afafb31ff6..a3ccfda04ac 100755 --- a/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1a.sh @@ -1,31 +1,51 @@ #!/bin/bash -#started from tedlium recipe with few edits +# ./local/chain/compare_wer.sh exp/chain/tdnn_1a_sp +# System tdnn_1a_sp +# WER 16.47 +# CER 6.68 +# Final train prob -0.0652 +# Final valid prob -0.0831 +# Final train prob (xent) -0.8965 +# Final valid prob (xent) -0.9964 +# steps/info/chain_dir_info.pl exp/chain/tdnn_1a_sp/ +# exp/chain/tdnn_1a_sp/: num-iters=441 nj=3..16 num-params=18.6M dim=40+100->5816 combine=-0.063->-0.062 (over 6) xent:train/valid[293,440,final]=(-1.22,-0.912,-0.896/-1.29,-1.01,-0.996) logprob:train/valid[293,440,final]=(-0.097,-0.066,-0.065/-0.108,-0.084,-0.083) -set -e -o pipefail -# First the options that are passed through to run_ivector_common.sh -# (some of which are also used in this script directly). 
+set -e -o pipefail stage=0 nj=30 -decode_nj=30 -min_seg_len=1.55 -xent_regularize=0.1 train_set=train -gmm=tri2b # the gmm for the target data +test_set=test +gmm=tri3b # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. num_threads_ubm=32 -nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned - -# The rest are configs specific to this script. Most of the parameters -# are just hardcoded at this level, in the commands below. -train_stage=-10 #default -10 -tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. -tdnn_affix=1b #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration. -common_egs_dir= # you can set this to use previously dumped egs. +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. + +# Options which are not passed through to run_ivector_common.sh +affix=_1a #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +common_egs_dir= +reporting_email= + +# LSTM/chain options +train_stage=-10 +xent_regularize=0.1 +dropout_schedule='0,0@0.20,0.5@0.50,0' + +# training chunk-options +chunk_width=150,110,100 +get_egs_stage=-10 + +# training options +srand=0 +remove_egs=true +run_ivector_common=true +run_chain_common=true # End configuration section. echo "$0 $@" # Print the command line for logging + . ./cmd.sh . ./path.sh . ./utils/parse_options.sh @@ -39,169 +59,162 @@ where "nvcc" is installed. 
EOF fi -local/nnet3/run_ivector_common.sh --stage $stage \ - --nj $nj \ - --min-seg-len $min_seg_len \ - --train-set $train_set \ - --gmm $gmm \ - --num-threads-ubm $num_threads_ubm \ - --nnet3-affix "$nnet3_affix" - - -gmm_dir=exp/$gmm -ali_dir=exp/${gmm}_ali_${train_set}_sp_comb -tree_dir=exp/chain${nnet3_affix}/tree_bi${tree_affix} -lat_dir=exp/chain${nnet3_affix}/${gmm}_${train_set}_sp_comb_lats -dir=exp/chain${nnet3_affix}/tdnn${tdnn_affix}_sp_bi -train_data_dir=data/${train_set}_sp_hires_comb -lores_train_data_dir=data/${train_set}_sp_comb -train_ivector_dir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_sp_hires_comb - +if $run_ivector_common; then + local/nnet3/run_ivector_common.sh \ + --stage $stage --nj $nj \ + --train-set $train_set --gmm $gmm \ + --num-threads-ubm $num_threads_ubm \ + --nnet3-affix "$nnet3_affix" +fi -for f in $gmm_dir/final.mdl $train_data_dir/feats.scp $train_ivector_dir/ivector_online.scp \ - $lores_train_data_dir/feats.scp $ali_dir/ali.1.gz $gmm_dir/final.mdl; do +gmm_dir=exp/${gmm} +ali_dir=exp/${gmm}_ali_${train_set}_sp +lat_dir=exp/chain${nnet3_affix}/${gmm}_${train_set}_sp_lats +dir=exp/chain${nnet3_affix}/tdnn${affix}_sp +train_data_dir=data/${train_set}_sp_hires +train_ivector_dir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_sp_hires +lores_train_data_dir=data/${train_set}_sp + +# note: you don't necessarily have to change the treedir name +# each time you do a new experiment-- only if you change the +# configuration in a way that affects the tree. +tree_dir=exp/chain${nnet3_affix}/tree_a_sp +# the 'lang' directory is created by this script. +# If you create such a directory with a non-standard topology +# you should probably name it differently. +lang=data/lang_chain + +for f in $train_data_dir/feats.scp $train_ivector_dir/ivector_online.scp \ + $lores_train_data_dir/feats.scp $gmm_dir/final.mdl \ + $ali_dir/ali.1.gz $gmm_dir/final.mdl; do [ ! 
-f $f ] && echo "$0: expected file $f to exist" && exit 1 done -if [ $stage -le 14 ]; then - echo "$0: creating lang directory with one state per phone." - # Create a version of the lang/ directory that has one state per phone in the - # topo file. [note, it really has two states.. the first one is only repeated - # once, the second one has zero or more repeats.] - if [ -d data/lang_chain ]; then - if [ data/lang_chain/L.fst -nt data/lang/L.fst ]; then - echo "$0: data/lang_chain already exists, not overwriting it; continuing" - else - echo "$0: data/lang_chain already exists and seems to be older than data/lang..." - echo " ... not sure what to do. Exiting." - exit 1; - fi - else - cp -r data/lang data/lang_chain - silphonelist=$(cat data/lang_chain/phones/silence.csl) || exit 1; - nonsilphonelist=$(cat data/lang_chain/phones/nonsilence.csl) || exit 1; - # Use our special topology... note that later on may have to tune this - # topology. - steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >data/lang_chain/topo - fi +# Please take this as a reference on how to specify all the options of +# local/chain/run_chain_common.sh +if $run_chain_common; then + local/chain/run_chain_common.sh --stage $stage \ + --gmm-dir $gmm_dir \ + --ali-dir $ali_dir \ + --lores-train-data-dir ${lores_train_data_dir} \ + --lang $lang \ + --lat-dir $lat_dir \ + --num-leaves 7000 \ + --tree-dir $tree_dir || exit 1; fi if [ $stage -le 15 ]; then - # Get the alignments as lattices (gives the chain training more freedom). - # use the same num-jobs as the alignments - steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ - data/lang $gmm_dir $lat_dir - rm $lat_dir/fsts.*.gz # save space -fi - -if [ $stage -le 16 ]; then - # Build a tree using our new topology. We know we have alignments for the - # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use - # those. 
- if [ -f $tree_dir/final.mdl ]; then - echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." - exit 1; - fi - steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ - --context-opts "--context-width=2 --central-position=1" \ - --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir -fi - -if [ $stage -le 17 ]; then mkdir -p $dir - echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + affine_opts="l2-regularize=0.01 dropout-proportion=0.0 dropout-per-dim=true dropout-per-dim-continuous=true" + tdnnf_opts="l2-regularize=0.01 dropout-proportion=0.0 bypass-scale=0.66" + linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0" + prefinal_opts="l2-regularize=0.01" + output_opts="l2-regularize=0.002" mkdir -p $dir/configs + cat < $dir/configs/network.xconfig input dim=100 name=ivector input dim=40 name=input - # please note that it is important to have input layer with the name=input # as the layer immediately preceding the fixed-affine-layer to enable # the use of short notation for the descriptor fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat - # the first splicing is moved before the lda layer, so no splicing here - relu-renorm-layer name=tdnn1 dim=450 - relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=450 - relu-renorm-layer name=tdnn3 input=Append(-1,0,1,2) dim=450 - relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=450 - relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=450 - relu-renorm-layer name=tdnn6 input=Append(-6,-3,0) dim=450 - - ## adding the layers for chain branch - relu-renorm-layer name=prefinal-chain input=tdnn6 dim=450 target-rms=0.5 - output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 - - # adding the layers for xent branch - # 
This block prints the configs for a separate output that will be - # trained with a cross-entropy objective in the 'chain' models... this - # has the effect of regularizing the hidden parts of the model. we use - # 0.5 / args.xent_regularize as the learning rate factor- the factor of - # 0.5 / args.xent_regularize is suitable as it means the xent - # final-layer learns at a rate independent of the regularization - # constant; and the 0.5 was tuned so as to make the relative progress - # similar in the xent and regular final layers. - relu-renorm-layer name=prefinal-xent input=tdnn6 dim=450 target-rms=0.5 - output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 - + relu-batchnorm-dropout-layer name=tdnn1 $affine_opts dim=1536 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=0 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf14 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf15 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + linear-component name=prefinal-l dim=256 $linear_opts + prefinal-layer 
name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts EOF steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ - fi -if [ $stage -le 18 ]; then + +if [ $stage -le 16 ]; then if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then utils/create_split_dir.pl \ - /export/b0{5,6,7,8}/$USER/kaldi-data/egs/gale_arabic-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/wsj-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage fi - steps/nnet3/chain/train.py --stage $train_stage \ + steps/nnet3/chain/train.py --stage $train_stage \ --cmd "$decode_cmd" \ --feat.online-ivector-dir $train_ivector_dir \ --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ --chain.xent-regularize $xent_regularize \ --chain.leaky-hmm-coefficient 0.1 \ - --chain.l2-regularize 0.00005 \ + --chain.l2-regularize 0.0 \ --chain.apply-deriv-weights false \ --chain.lm-opts="--num-extra-lm-states=2000" \ - --egs.dir "$common_egs_dir" \ - --egs.opts "--frames-overlap-per-eg 0" \ - --egs.chunk-width 150 \ - --trainer.num-chunk-per-minibatch 128 \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs 6 \ --trainer.frames-per-iter 1500000 \ - --trainer.num-epochs 4 \ - --trainer.optimization.num-jobs-initial 2 \ - --trainer.optimization.num-jobs-final 2 \ - --trainer.optimization.initial-effective-lrate 0.001 \ - --trainer.optimization.final-effective-lrate 0.0001 \ - --trainer.max-param-change 2.0 \ - --cleanup.remove-egs true \ - --feat-dir $train_data_dir \ + 
--trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.00025 \ + --trainer.optimization.final-effective-lrate 0.000025 \ + --trainer.num-chunk-per-minibatch=64,32 \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0 --constrained false" \ + --egs.stage $get_egs_stage \ + --reporting.email="$reporting_email" \ + --cleanup.remove-egs=$remove_egs \ + --feat-dir=$train_data_dir \ --tree-dir $tree_dir \ - --lat-dir $lat_dir \ - --dir $dir -fi - + --lat-dir=$lat_dir \ + --dir $dir || exit 1; +fi -if [ $stage -le 19 ]; then - # Note: it might appear that this data/lang_chain directory is mismatched, and it is as - # far as the 'topo' is concerned, but this script doesn't read the 'topo' from - # the lang directory. - utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_test $dir $dir/graph +if [ $stage -le 17 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. 
+ + utils/lang/check_phones_compatible.sh \ + data/lang_test/phones.txt $lang/phones.txt + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test \ + $tree_dir $tree_dir/graph || exit 1; fi -if [ $stage -le 20 ]; then +if [ $stage -le 18 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) rm $dir/.error 2>/dev/null || true - steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ - --acwt 1.0 --post-decode-acwt 10.0 \ - --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_test_hires \ - --scoring-opts "--min-lmwt 5 " \ - $dir/graph data/test_hires $dir/decode || exit 1; + + steps/nnet3/decode.sh \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 0 --extra-right-context 0 \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$decode_cmd" --num-threads 4 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${test_set}_hires \ + $tree_dir/graph data/${test_set}_hires ${dir}/decode_${test_set} || exit 1 fi -exit 0 diff --git a/egs/gale_arabic/s5b/local/gale_data_prep_audio.sh b/egs/gale_arabic/s5b/local/gale_data_prep_audio.sh deleted file mode 100755 index 0125272d06c..00000000000 --- a/egs/gale_arabic/s5b/local/gale_data_prep_audio.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/bin/bash - -# Copyright 2014 QCRI (author: Ahmed Ali) -# Apache 2.0 - - -galeData=$(utils/make_absolute.sh "${@: -1}" ); # last argumnet; the local folder -audio_dvds=${@:1:${#}-1} # all the audio dvds for GALE corpus; ; check audio=( in ../run.sh - -mkdir -p $galeData - -# check that sox is installed -which sox &>/dev/null -if [[ $? != 0 ]]; then - echo "sox is not installed"; exit 1 -fi - -for dvd in $audio_dvds; do - dvd_full_path=$(utils/make_absolute.sh $dvd) - if [[ ! 
-e $dvd_full_path ]]; then - echo missing $dvd_full_path; exit 1; - fi - find $dvd_full_path \( -name "*.wav" -o -name "*.flac" \) | while read file; do - id=$(basename $file | awk '{gsub(".wav","");gsub(".flac","");print}') - echo "$id sox $file -r 16000 -t wav - |" - done -done | sort -u > $galeData/wav.scp - -echo data prep audio succeded - -exit 0 - diff --git a/egs/gale_arabic/s5b/local/gale_data_prep_split.sh b/egs/gale_arabic/s5b/local/gale_data_prep_split.sh deleted file mode 100755 index b18a4e5b105..00000000000 --- a/egs/gale_arabic/s5b/local/gale_data_prep_split.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/bash - -# Copyright 2014 QCRI (author: Ahmed Ali) -# Apache 2.0 - -if [ $# -ne 1 ]; then - echo "Arguments should be the "; exit 1 -fi - - -#data will data/local - -galeData=$(utils/make_absolute.sh $1) -mkdir -p data/local -dir=$(utils/make_absolute.sh data/local) - - -grep -f local/test_list $galeData/all | grep -v -f local/bad_segments > $galeData/all.test -grep -v -f local/test_list $galeData/all | grep -v -f local/bad_segments > $galeData/all.train - -for x in test train; do - outdir=$dir/$x - file=$galeData/all.$x - mkdir -p $outdir - awk '{print $2 " " $2}' $file | sort -u > $outdir/utt2spk - cp -pr $outdir/utt2spk $outdir/spk2utt - awk '{print $2 " " $1 " " $3 " " $4}' $file | sort -u > $outdir/segments - awk '{printf $2 " "; for (i=5; i<=NF; i++) {printf $i " "} printf "\n"}' $file | sort -u > $outdir/text -done - - -grep -f local/test_list $galeData/wav.scp > $dir/test/wav.scp - -cat $galeData/wav.scp | awk -v seg=$dir/train/segments 'BEGIN{while((getline0) {seen[$2]=1;}} - {if (seen[$1]) { print $0}}' > $dir/train/wav.scp - -echo data prep split succeeded - -exit 0 diff --git a/egs/gale_arabic/s5b/local/gale_data_prep_txt.sh b/egs/gale_arabic/s5b/local/gale_data_prep_txt.sh deleted file mode 100755 index 04529d88ac0..00000000000 --- a/egs/gale_arabic/s5b/local/gale_data_prep_txt.sh +++ /dev/null @@ -1,60 +0,0 @@ -#!/bin/bash - -# Copyright 2014 
QCRI (author: Ahmed Ali) -# Apache 2.0 - -galeData=$(utils/make_absolute.sh "${@: -1}" ); # last argumnet; the local folder -txt_dvds=${@:1:${#}-1} # all the txt cds correspoding to the audio corpus; check text=( in ../run.sh - - -top_pwd=`pwd` -txtdir=$galeData/txt -mkdir -p $txtdir; cd $txtdir - -for cdx in $txt_dvds; do - echo "Preparing $cdx" - if [[ $cdx == *.tgz ]] ; then - tar -xvf $cdx - elif [ -d "$cdx" ]; then - ln -s $cdx `basename $cdx` - else - echo "I don't really know what I shall do with $cdx " >&2 - fi -done - -find -L . -type f -name "*.tdf" | while read file; do -sed '1,3d' $file # delete the first 3 lines -done > all.tmp$$ - -perl -e ' - ($inFile,$idFile,$txtFile)= split /\s+/, $ARGV[0]; - open(IN, "$inFile"); - open(ID, ">$idFile"); - open(TXT, ">$txtFile"); - while () { - @arr= split /\t/,$_; - $start=sprintf ("%0.3f",$arr[2]);$rStart=$start;$start=~s/\.//; $start=~s/^0+$/0/; $start=~s/^0+([^0])/$1/; # remove zeros at the beginning - $end=sprintf ("%0.3f",$arr[3]);$rEnd=$end;$end=~s/^0+([^0])/$1/;$end=~s/\.//; - if ( ($arr[11] !~ m/report/) && ($arr[11] !~ m/conversational/) ){$arr[11]="UNK";} - $id="$arr[11] $arr[0] $arr[0]_${start}_${end} $rStart $rEnd\n"; - next if ($rStart == $rEnd); - $id =~ s/.sph//g; - print ID $id; - print TXT "$arr[7]\n"; - }' "all.tmp$$ allid.tmp$$ contentall.tmp$$" - - -perl ${top_pwd}/local/normalize_transcript_BW.pl contentall.tmp$$ contentall.buck.tmp$$ - -paste allid.tmp$$ contentall.buck.tmp$$ | sed 's: $::' | awk '{if (NF>5) {print $0}}' > all_1.tmp$$ - -awk '{$1="";print $0}' all_1.tmp$$ | sed 's:^ ::' > $galeData/all -awk '{if ($1 == "report") {$1="";print $0}}' all_1.tmp$$ | sed 's:^ ::' > $galeData/report -awk '{if ($1 == "conversational") {$1="";print $0}}' all_1.tmp$$ | sed 's:^ ::' > $galeData/conversational - -cd ..; -rm -fr $txtdir -cd $top_pwd -echo data prep text succeeded - -exit 0 diff --git a/egs/gale_arabic/s5b/local/gale_format_data.sh b/egs/gale_arabic/s5b/local/gale_format_data.sh deleted 
file mode 100755 index b69c34e68b9..00000000000 --- a/egs/gale_arabic/s5b/local/gale_format_data.sh +++ /dev/null @@ -1,60 +0,0 @@ -#!/bin/bash - -# Copyright 2014 QCRI (author: Ahmed Ali) -# Apache 2.0 - -if [ -f path.sh ]; then - . ./path.sh; else - echo "$0: missing path.sh"; exit 1; -fi - -for dir in test train; do - cp -pr data/local/$dir data/$dir -done - - -mkdir -p data/lang_test - -arpa_lm=data/local/lm/3gram-mincount/lm_unpruned.gz -[ ! -f $arpa_lm ] && echo No such file $arpa_lm && exit 1; - -rm -r data/lang_test -cp -r data/lang data/lang_test - -gunzip -c "$arpa_lm" | \ - arpa2fst --disambig-symbol=#0 \ - --read-symbol-table=data/lang_test/words.txt - data/lang_test/G.fst - - -echo "$0: Checking how stochastic G is (the first of these numbers should be small):" -fstisstochastic data/lang_test/G.fst - -## Check lexicon. -## just have a look and make sure it seems sane. -echo "$0: First few lines of lexicon FST:" -fstprint --isymbols=data/lang/phones.txt --osymbols=data/lang/words.txt data/lang/L.fst | head - -echo "$0: Performing further checks" - -# Checking that G.fst is determinizable. -fstdeterminize data/lang_test/G.fst /dev/null || echo Error determinizing G. - -# Checking that L_disambig.fst is determinizable. -fstdeterminize data/lang_test/L_disambig.fst /dev/null || echo Error determinizing L. - -# Checking that disambiguated lexicon times G is determinizable -# Note: we do this with fstdeterminizestar not fstdeterminize, as -# fstdeterminize was taking forever (presumbaly relates to a bug -# in this version of OpenFst that makes determinization slow for -# some case). -fsttablecompose data/lang_test/L_disambig.fst data/lang_test/G.fst | \ - fstdeterminizestar >/dev/null || echo Error - -# Checking that LG is stochastic: -fsttablecompose data/lang/L_disambig.fst data/lang_test/G.fst | \ - fstisstochastic || echo LG is not stochastic - - -echo gale_format_data succeeded. 
- -exit 0 diff --git a/egs/gale_arabic/s5b/local/gale_prep_grapheme_dict.sh b/egs/gale_arabic/s5b/local/gale_prep_grapheme_dict.sh deleted file mode 100755 index 5f101f8245b..00000000000 --- a/egs/gale_arabic/s5b/local/gale_prep_grapheme_dict.sh +++ /dev/null @@ -1,41 +0,0 @@ -#!/bin/bash - -# Copyright 2017 QCRI (author: Ahmed Ali) -# Apache 2.0 - - -# run this from ../ -dir=$(utils/make_absolute.sh data/local/dict) -mkdir -p $dir - - -# (1) Get all avaialble dictionaries, since this is a grapheme model, so we mainly need the most frequent word lists -wget http://alt.qcri.org//resources/speech/dictionary/ar-ar_grapheme_lexicon_2016-02-09.bz2 || exit 1; -wget http://alt.qcri.org//resources/speech/dictionary/ar-ar_lexicon_2014-03-17.txt.bz2 || exit 1; -bzcat ar-ar_grapheme_lexicon_2016-02-09.bz2 | sed '1,3d' | awk '{print $1}' > tmp$$ -bzcat ar-ar_lexicon_2014-03-17.txt.bz2 | sed '1,3d' | awk '{print $1}' >> tmp$$ -# (2) Now we add all the words appeared in the training data -cat data/local/train/text | cut -d ' ' -f 2- | tr -s " " "\n" | sort -u >> tmp$$ -grep -v [0-9] tmp$$ | sed -e 's:[FNKaui\~o\`]::g' -e 's:{:}:g' | sort -u > tmp1.$$ # remove vowels and rare alef wasla -cat tmp1.$$ | sed 's:\(\):\1 :g' | sed -e 's: : :g' -e 's: : :g' -e 's:\s*: :g' -e 's:\*:V:g' > tmp2.$$ -paste -d ' ' tmp1.$$ tmp2.$$ > $dir/lexicon.txt - -#(2) Dictionary preparation: - -# silence phones, one per line. -echo SIL > $dir/silence_phones.txt -echo SIL > $dir/optional_silence.txt - -# nonsilence phones; on each line is a list of phones that correspond -# really to the same base phone. 
-cat tmp2.$$ | tr -s ' ' '\n' | grep -v ^$ | sort -u > $dir/nonsilence_phones.txt || exit 1; - -sed -i '1i SIL' $dir/lexicon.txt # insert word with phone sil at the begining of the dictionary - -rm -fr ar-ar_lexicon_2014-03-17.txt.bz2 ar-ar_grapheme_lexicon_2016-02-09.bz2 tmp$$ tmp1.$$ tmp2.$$ -echo Dictionary preparation succeeded - -# The script is still missing dates and numbers - -exit 0 - diff --git a/egs/gale_arabic/s5b/local/gale_train_lms.sh b/egs/gale_arabic/s5b/local/gale_train_lms.sh deleted file mode 100755 index 3988ec3818f..00000000000 --- a/egs/gale_arabic/s5b/local/gale_train_lms.sh +++ /dev/null @@ -1,81 +0,0 @@ -#!/bin/bash - - -# To be run from one directory above this script. - - -lexicon=data/local/dict/lexicon.txt -[ ! -f $lexicon ] && echo "$0: No such file $lexicon" && exit 1; - - -# This script takes no arguments. It assumes you have already run -# previus steps successfully -# It takes as input the files -#data/local/train.*/text -#data/local/dict/lexicon.txt - - -export LC_ALL=C # You'll get errors about things being not sorted, if you -# have a different locale. -export PATH=$PATH:./../../../tools/kaldi_lm -( # First make sure the kaldi_lm toolkit is installed. - cd $KALDI_ROOT/tools || exit 1; - if [ -d kaldi_lm ]; then - echo Not installing the kaldi_lm toolkit since it is already there. - else - echo Downloading and installing the kaldi_lm tools - if [ ! -f kaldi_lm.tar.gz ]; then - wget http://www.danielpovey.com/files/kaldi/kaldi_lm.tar.gz || exit 1; - fi - tar -xvzf kaldi_lm.tar.gz || exit 1; - cd kaldi_lm - make || exit 1; - echo Done making the kaldi_lm tools - fi -) || exit 1; - - -dir=data/local/lm - mkdir -p $dir - text=data/local/train/text - [ ! 
-f $text ] && echo "$0: No such file $text" && exit 1; - - cleantext=$dir/text.no_oov - - cat $text | awk -v lex=$lexicon 'BEGIN{while((getline0){ seen[$1]=1; } } - {for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf(" ",$n);} } printf("\n");}' \ - > $cleantext || exit 1; - - - cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \ - sort -nr > $dir/word.counts || exit 1; - - -# Get counts from acoustic training transcripts, and add one-count -# for each word in the lexicon (but not silence, we don't want it -# in the LM-- we'll add it optionally later). - cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \ - cat - <(grep -w -v '!SIL' $lexicon | awk '{print $1}') | \ - sort | uniq -c | sort -nr > $dir/unigram.counts || exit 1; - -# note: we probably won't really make use of as there aren't any OOVs - cat $dir/unigram.counts | awk '{print $2}' | get_word_map.pl "" "" "" > $dir/word_map \ - || exit 1; - -# note: ignore 1st field of train.txt, it's the utterance-id. - cat $cleantext | awk -v wmap=$dir/word_map 'BEGIN{while((getline0)map[$1]=$2;} - { for(n=2;n<=NF;n++) { printf map[$n]; if(n$dir/train.gz \ - || exit 1; - - train_lm.sh --arpa --lmtype 3gram-mincount $dir || exit 1; - -# LM is small enough that we don't need to prune it (only about 0.7M N-grams). -# Perplexity over 128254.000000 words is 90.446690 - -# note: output is -# data/local/lm/3gram-mincount/lm_unpruned.gz - - -echo train lm succeeded - -exit 0 diff --git a/egs/gale_arabic/s5b/local/nnet3/run_ivector_common.sh b/egs/gale_arabic/s5b/local/nnet3/run_ivector_common.sh index f14c8441869..f071842dc0b 100755 --- a/egs/gale_arabic/s5b/local/nnet3/run_ivector_common.sh +++ b/egs/gale_arabic/s5b/local/nnet3/run_ivector_common.sh @@ -2,31 +2,29 @@ set -e -o pipefail -# This script is called from local/nnet3/run_tdnn.sh and local/chain/run_tdnn.sh (and may eventually -# be called by more scripts). 
It contains the common feature preparation and iVector-related parts -# of the script. See those scripts for examples of usage. +# This script is called from scripts like local/nnet3/run_tdnn.sh and +# local/chain/run_tdnn.sh (and may eventually be called by more scripts). It +# contains the common feature preparation and iVector-related parts of the +# script. See those scripts for examples of usage. stage=0 nj=100 -min_seg_len=1.55 # min length in seconds... we do this because chain training - # will discard segments shorter than 1.5 seconds. Must remain in sync - # with the same option given to prepare_lores_feats_and_alignments.sh train_set=train # you might set this to e.g. train. -gmm=tri2b # This specifies a GMM-dir from the features of the type you're training the system on; +test_sets="test" +gmm=tri3b # This specifies a GMM-dir from the features of the type you're training the system on; # it should contain alignments for 'train_set'. num_threads_ubm=32 -nnet3_affix=_cleaned # affix for exp/nnet3 directory to put iVector stuff in, so it - # becomes exp/nnet3_cleaned or whatever. +nnet3_affix= # affix for exp/nnet3 directory to put iVector stuff . ./cmd.sh . ./path.sh -. ./utils/parse_options.sh +. utils/parse_options.sh gmm_dir=exp/${gmm} -ali_dir=exp/${gmm}_ali_${train_set}_sp_comb +ali_dir=exp/${gmm}_ali_${train_set}_sp for f in data/${train_set}/feats.scp ${gmm_dir}/final.mdl; do if [ ! -f $f ]; then @@ -61,7 +59,7 @@ if [ $stage -le 2 ]; then utils/create_split_dir.pl /export/b0{5,6,7,8}/$USER/kaldi-data/mfcc/gale_arabic-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage fi - for datadir in ${train_set}_sp test; do + for datadir in ${train_set}_sp ${test_sets}; do utils/copy_data_dir.sh data/$datadir data/${datadir}_hires done @@ -69,7 +67,7 @@ if [ $stage -le 2 ]; then # features; this helps make trained nnets more invariant to test data volume. 
utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires - for datadir in ${train_set}_sp test; do + for datadir in ${train_set}_sp ${test_sets}; do steps/make_mfcc.sh --nj $nj --mfcc-config conf/mfcc_hires.conf \ --cmd "$train_cmd" data/${datadir}_hires steps/compute_cmvn_stats.sh data/${datadir}_hires @@ -78,75 +76,33 @@ if [ $stage -le 2 ]; then fi if [ $stage -le 3 ]; then - echo "$0: combining short segments of speed-perturbed high-resolution MFCC training data" - # we have to combine short segments or we won't be able to train chain models - # on those segments. - utils/data/combine_short_segments.sh \ - data/${train_set}_sp_hires $min_seg_len data/${train_set}_sp_hires_comb - - # just copy over the CMVN to avoid having to recompute it. - cp data/${train_set}_sp_hires/cmvn.scp data/${train_set}_sp_hires_comb/ - utils/fix_data_dir.sh data/${train_set}_sp_hires_comb/ -fi - -if [ $stage -le 4 ]; then - echo "$0: selecting segments of hires training data that were also present in the" - echo " ... original training data." - - # note, these data-dirs are temporary; we put them in a sub-directory - # of the place where we'll make the alignments. - temp_data_root=exp/nnet3${nnet3_affix}/tri5 - mkdir -p $temp_data_root - - utils/data/subset_data_dir.sh --utt-list data/${train_set}/feats.scp \ - data/${train_set}_sp_hires $temp_data_root/${train_set}_hires - - # note: essentially all the original segments should be in the hires data. - n1=$(wc -l /dev/null +if [[ $? != 0 ]]; then + echo "$0: sox is not installed"; exit 1 +fi + +for dvd in $dir1 $dir2 $dir3; do + dvd_full_path=$(utils/make_absolute.sh $dvd) + if [[ ! 
-e $dvd_full_path ]]; then + echo "$0: missing $dvd_full_path"; exit 1; + fi + find $dvd_full_path \( -name "*.wav" -o -name "*.flac" \) | while read file; do + id=$(basename $file | awk '{gsub(".wav","");gsub(".flac","");print}') + echo "$id sox $file -r 16000 -t wav - |" + done +done | sort -u > $gale_data/wav.scp +echo "$0:data prep audio succeded" + +gale_data=$(utils/make_absolute.sh "GALE" ); +top_pwd=`pwd` +txtdir=$gale_data/txt +mkdir -p $txtdir; cd $txtdir + +for cdx in $text1 $text2 $text3; do + echo "$0:Preparing $cdx" + if [[ $cdx == *.tgz ]] ; then + tar -xvf $cdx + elif [ -d "$cdx" ]; then + ln -s $cdx `basename $cdx` + else + echo "$0:I don't really know what I shall do with $cdx " >&2 + fi +done + +find -L . -type f -name "*.tdf" | while read file; do +sed '1,3d' $file # delete the first 3 lines +done > all.tmp$$ + +perl -e ' + ($inFile,$idFile,$txtFile)= split /\s+/, $ARGV[0]; + open(IN, "$inFile"); + open(ID, ">$idFile"); + open(TXT, ">$txtFile"); + while () { + @arr= split /\t/,$_; + $start=sprintf ("%0.3f",$arr[2]);$rStart=$start;$start=~s/\.//; $start=~s/^0+$/0/; $start=~s/^0+([^0])/$1/; # remove zeros at the beginning + $end=sprintf ("%0.3f",$arr[3]);$rEnd=$end;$end=~s/^0+([^0])/$1/;$end=~s/\.//; + if ( ($arr[11] !~ m/report/) && ($arr[11] !~ m/conversational/) ){$arr[11]="UNK";} + $id="$arr[11] $arr[0] $arr[0]_${start}_${end} $rStart $rEnd\n"; + next if ($rStart == $rEnd); + $id =~ s/.sph//g; + print ID $id; + print TXT "$arr[7]\n"; + }' "all.tmp$$ allid.tmp$$ contentall.tmp$$" + +perl ${top_pwd}/local/normalize_transcript_BW.pl contentall.tmp$$ contentall.buck.tmp$$ +paste allid.tmp$$ contentall.buck.tmp$$ | sed 's: $::' | awk '{if (NF>5) {print $0}}' > all_1.tmp$$ + + +awk '{$1="";print $0}' all_1.tmp$$ | sed 's:^ ::' > $gale_data/all +awk '{if ($1 == "report") {$1="";print $0}}' all_1.tmp$$ | sed 's:^ ::' > $gale_data/report +awk '{if ($1 == "conversational") {$1="";print $0}}' all_1.tmp$$ | sed 's:^ ::' > $gale_data/conversational + +cd 
..; +rm -fr $txtdir +cd $top_pwd +echo "$0:dat a prep text succeeded" + +mkdir -p data +dir=$(utils/make_absolute.sh data/) +grep -f local/test_list $gale_data/all | grep -v -f local/bad_segments > $gale_data/all.test +grep -v -f local/test_list $gale_data/all | grep -v -f local/bad_segments > $gale_data/all.train + +for x in test train; do + outdir=data/$x + file=$gale_data/all.$x + mkdir -p $outdir + awk '{print $2 " " $2}' $file | sort -u > $outdir/utt2spk + cp -pr $outdir/utt2spk $outdir/spk2utt + awk '{print $2 " " $1 " " $3 " " $4}' $file | sort -u > $outdir/segments + awk '{printf $2 " "; for (i=5; i<=NF; i++) {printf $i " "} printf "\n"}' $file | sort -u > $outdir/text +done + +grep -f local/test_list $gale_data/wav.scp > $dir/test/wav.scp + +cat $gale_data/wav.scp | awk -v seg=$dir/train/segments 'BEGIN{while((getline0) {seen[$2]=1;}} + {if (seen[$1]) { print $0}}' > $dir/train/wav.scp + +echo "$0:data prep split succeeded" +exit 0 diff --git a/egs/gale_arabic/s5b/local/prepare_dict.sh b/egs/gale_arabic/s5b/local/prepare_dict.sh new file mode 100755 index 00000000000..47b5869fdf1 --- /dev/null +++ b/egs/gale_arabic/s5b/local/prepare_dict.sh @@ -0,0 +1,48 @@ +#!/usr/bin/env bash + +# Copyright 2017 QCRI (author: Ahmed Ali) +# Apache 2.0 +# This script prepares the dictionary. + +set -e +dir=data/local/dict +lexicon_url1="http://alt.qcri.org//resources/speech/dictionary/ar-ar_grapheme_lexicon_2016-02-09.bz2"; +lexicon_url2="http://alt.qcri.org//resources/speech/dictionary/ar-ar_lexicon_2014-03-17.txt.bz2"; +stage=0 +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh || exit 1; +mkdir -p $dir data/local/lexicon_data + +if [ $stage -le 0 ]; then + echo "$0: Downloading text for lexicon... $(date)." 
+ wget -P data/local/lexicon_data $lexicon_url1 + wget -P data/local/lexicon_data $lexicon_url2 + bzcat data/local/lexicon_data/ar-ar_grapheme_lexicon_2016-02-09.bz2 | sed '1,3d' | awk '{print $1}' > data/local/lexicon_data/grapheme_lexicon + bzcat data/local/lexicon_data/ar-ar_lexicon_2014-03-17.txt.bz2 | sed '1,3d' | awk '{print $1}' >> data/local/lexicon_data/grapheme_lexicon + cat data/train/text | cut -d ' ' -f 2- | tr -s " " "\n" | sort -u >> data/local/lexicon_data/grapheme_lexicon +fi + + +if [ $stage -le 0 ]; then + echo "$0: processing lexicon text and creating lexicon... $(date)." + # remove vowels and rare alef wasla + grep -v [0-9] data/local/lexicon_data/grapheme_lexicon | sed -e 's:[FNKaui\~o\`]::g' -e 's:{:}:g' | sort -u > data/local/lexicon_data/processed_lexicon + local/prepare_lexicon.py +fi + +cut -d' ' -f2- $dir/lexicon.txt | sed 's/SIL//g' | tr ' ' '\n' | sort -u | sed '/^$/d' >$dir/nonsilence_phones.txt || exit 1; + +sed -i '1i UNK' $dir/lexicon.txt + +echo UNK >> $dir/nonsilence_phones.txt + +echo ' SIL' >> $dir/lexicon.txt + +echo SIL > $dir/silence_phones.txt + +echo SIL >$dir/optional_silence.txt + +echo -n "" >$dir/extra_questions.txt + +echo "$0: Dictionary preparation succeeded" diff --git a/egs/gale_arabic/s5b/local/prepare_lexicon.py b/egs/gale_arabic/s5b/local/prepare_lexicon.py new file mode 100755 index 00000000000..215541585eb --- /dev/null +++ b/egs/gale_arabic/s5b/local/prepare_lexicon.py @@ -0,0 +1,26 @@ +#!/usr/bin/env python3 + +# Copyright 2018 Ashish Arora +# Apache 2.0 + +# This script prepares lexicon. 
+ +import argparse +import os + +parser = argparse.ArgumentParser(description="""Creates the list of characters and words in lexicon""") +args = parser.parse_args() + +### main ### +lex = {} +text_path = os.path.join('data','local', 'lexicon_data', 'processed_lexicon') +with open(text_path, 'r', encoding='utf-8') as f: + for line in f: + line = line.strip() + characters = list(line) + characters = " ".join(['V' if char == '*' else char for char in characters]) + lex[line] = characters + +with open(os.path.join('data','local','dict', 'lexicon.txt'), 'w', encoding='utf-8') as fp: + for key in sorted(lex): + fp.write(key + " " + lex[key] + "\n") diff --git a/egs/gale_arabic/s5b/local/prepare_lm.sh b/egs/gale_arabic/s5b/local/prepare_lm.sh new file mode 100755 index 00000000000..6fdf35f471a --- /dev/null +++ b/egs/gale_arabic/s5b/local/prepare_lm.sh @@ -0,0 +1,51 @@ +#!/bin/bash + +# Copyright 2012 Vassil Panayotov +# 2017 Ewald Enzinger +# Apache 2.0 + +. ./path.sh || exit 1 + +echo "=== Building a language model ..." + +dir=data/local/lm/ +text=data/train/text +lexicon=data/local/dict/lexicon.txt +# Language model order +order=3 + +. utils/parse_options.sh + +# Prepare a LM training corpus from the transcripts +mkdir -p $dir + +for f in "$text" "$lexicon"; do + [ ! -f $f ] && echo "$0: No such file $f" && exit 1; +done + +loc=`which ngram-count`; +if [ -z $loc ]; then + if uname -a | grep 64 >/dev/null; then # some kind of 64 bit... + sdir=$KALDI_ROOT/tools/srilm/bin/i686-m64 + else + sdir=$KALDI_ROOT/tools/srilm/bin/i686 + fi + if [ -f $sdir/ngram-count ]; then + echo Using SRILM tools from $sdir + export PATH=$PATH:$sdir + else + echo You appear to not have SRILM tools installed, either on your path, + echo or installed in $sdir. See tools/install_srilm.sh for installation + echo instructions. 
+ exit 1 + fi +fi + +cat data/train/text | cut -d " " -f 2- > $dir/text.txt +cut -d' ' -f1 $lexicon > $dir/wordlist + +ngram-count -text $dir/text.txt -order $order -limit-vocab -vocab $dir/wordlist \ + -unk -map-unk "" -kndiscount -interpolate -lm $dir/lm.gz + +#ngram -lm $dir/lm.gz -ppl $dir/dev.txt +echo "*** Finished building the LM model!" diff --git a/egs/gale_arabic/s5b/local/score.sh b/egs/gale_arabic/s5b/local/score.sh index 83366f7c7fc..1d84815fc69 100755 --- a/egs/gale_arabic/s5b/local/score.sh +++ b/egs/gale_arabic/s5b/local/score.sh @@ -1,60 +1,6 @@ -#!/bin/bash -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey) -# Apache 2.0 - -[ -f ./path.sh ] && . ./path.sh - -# begin configuration section. -cmd=run.pl -stage=0 -decode_mbr=true -word_ins_penalty=0.0 -min_lmwt=7 -max_lmwt=17 -iter= #some of the scripts from steps/ seem to use it -#end configuration section. - -echo "$0 $#" - -[ -f ./path.sh ] && . ./path.sh -. parse_options.sh || exit 1; - -if [ $# -ne 3 ]; then - echo "Usage: local/score.sh [--cmd (run.pl|queue.pl...)] " - echo " Options:" - echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." - echo " --stage (0|1|2) # start scoring script from part-way through." - echo " --decode_mbr (true/false) # maximum bayes risk decoding (confusion network)." - echo " --min_lmwt # minumum LM-weight for lattice rescoring " - echo " --max_lmwt # maximum LM-weight for lattice rescoring " - exit 1; -fi -data=$1 -lang_or_graph=$2 -dir=$3 - -symtab=$lang_or_graph/words.txt - -for f in $symtab $dir/lat.1.gz $data/text; do - [ ! 
-f $f ] && echo "score.sh: no such file $f" && exit 1; -done - -mkdir -p $dir/scoring/log - -cat $data/text | sed 's:::g' | sed 's:::g' > $dir/scoring/test_filt.txt - -$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/best_path.LMWT.log \ - lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ - lattice-add-penalty --word-ins-penalty=$word_ins_penalty ark:- ark:- \| \ - lattice-best-path --word-symbol-table=$symtab \ - ark:- ark,t:$dir/scoring/LMWT.tra || exit 1; +#!/bin/bash -# Note: the double level of quoting for the sed command -$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.log \ - cat $dir/scoring/LMWT.tra \| \ - utils/int2sym.pl -f 2- $symtab \| sed 's:\::g' \| \ - compute-wer --text --mode=present \ - ark:$dir/scoring/test_filt.txt ark,p:- ">&" $dir/wer_LMWT || exit 1; -exit 0; +steps/scoring/score_kaldi_wer.sh "$@" +steps/scoring/score_kaldi_cer.sh --stage 2 "$@" diff --git a/egs/gale_arabic/s5b/local/wer_output_filter b/egs/gale_arabic/s5b/local/wer_output_filter new file mode 100755 index 00000000000..cf48b434144 --- /dev/null +++ b/egs/gale_arabic/s5b/local/wer_output_filter @@ -0,0 +1,19 @@ +#!/usr/bin/env python3 + +# Copyright 2017 Hossein Hadian + +# Apache 2.0 +# This script converts a BPE-encoded text to normal text. It is used in scoring + +import sys, io +import string + +infile = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8') +output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') + +for line in infile: + words = line.strip().split() + words = [word for word in words if '' not in word] + uttid = words[0] + transcript = ' '.join(words[1:]) + output.write(uttid + ' ' + transcript + '\n') diff --git a/egs/gale_arabic/s5b/run.sh b/egs/gale_arabic/s5b/run.sh index c45f5119949..3f12d22495e 100755 --- a/egs/gale_arabic/s5b/run.sh +++ b/egs/gale_arabic/s5b/run.sh @@ -3,177 +3,121 @@ # Copyright 2014 QCRI (author: Ahmed Ali) # Apache 2.0 -. ./path.sh -. 
./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. - ## This relates to the queue. num_jobs=120 num_decode_jobs=40 +decode_gmm=true +stage=0 +overwrite=false -#NB: You can add whatever number of copora you like. The supported extensions -#NB: (formats) are wav and flac. Flac will be converted using sox and in contrast -#NB: with the old approach, the conversion will be on-the-fly and one-time-only -#NB: during the parametrization. - -#NB: Text corpora scpecification. We support either tgz files, which are unpacked -#NB: or just plain (already unpacked) directories. The list of transcript is then -#NB: obtained using find command - -#Make sure you edit this section to reflect whers you keep the LDC data on your cluster - -#This is CLSP configuration. We add the 2014 GALE data. We got around 2 % -#improvement just by including it. The gain might be large if someone would tweak -# the number of leaves and states and so on. - -#audio=( -# /export/corpora/LDC/LDC2013S02/ -# /export/corpora/LDC/LDC2013S07/ -# /export/corpora/LDC/LDC2014S07/ -#) -#text=( -# /export/corpora/LDC/LDC2013T17 -# /export/corpora/LDC/LDC2013T04 -# /export/corpora/LDC/LDC2014T17 -#) - -audio=( - /data/sls/scratch/amali/data/GALE/LDC2013S02 - /data/sls/scratch/amali/data/GALE/LDC2013S07 - /data/sls/scratch/amali/data/GALE/LDC2014S07 -) -text=( - /data/sls/scratch/amali/data/GALE/LDC2013T17.tgz - /data/sls/scratch/amali/data/GALE/LDC2013T04.tgz - /data/sls/scratch/amali/data/GALE/LDC2014T17.tgz -) +dir1=/export/corpora/LDC/LDC2013S02/ +dir2=/export/corpora/LDC/LDC2013S07/ +dir3=/export/corpora/LDC/LDC2014S07/ +text1=/export/corpora/LDC/LDC2013T17/ +text2=/export/corpora/LDC/LDC2013T04/ +text3=/export/corpora/LDC/LDC2014T17/ galeData=GALE -#prepare the data -#split train dev test -#prepare lexicon and LM - -# You can run the script from here automatically, but it is recommended to run the data preparation, -# and features extraction manually and and only once. 
-# By copying and pasting into your shell. - -#copy the audio files to local folder wav and convet flac files to wav -local/gale_data_prep_audio.sh "${audio[@]}" $galeData || exit 1; +. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. +. ./path.sh +. ./utils/parse_options.sh # e.g. this parses the above options + # if supplied. -#get the transcription and remove empty prompts and all noise markers -local/gale_data_prep_txt.sh "${text[@]}" $galeData || exit 1; +if [ $stage -le 0 ]; then -# split the data to reports and conversational and for each class will have rain/dev and test -local/gale_data_prep_split.sh $galeData || exit 1; + if [ -f data/train/text ] && ! $overwrite; then + echo "$0: Not processing, probably script have run from wrong stage" + echo "Exiting with status 1 to avoid data corruption" + exit 1; + fi -# get all Arabic grapheme dictionaries and add silence and UNK -local/gale_prep_grapheme_dict.sh || exit 1; + echo "$0: Preparing data..." + local/prepare_data.sh --dir1 $dir1 --dir2 $dir2 --dir3 $dir3 \ + --text1 $text1 --text2 $text2 --text3 $text3 + echo "$0: Preparing lexicon and LM..." + local/prepare_dict.sh -#prepare the langauge resources -utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang || exit 1; + utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang -# LM training -local/gale_train_lms.sh || exit 1; + local/prepare_lm.sh -local/gale_format_data.sh || exit 1; -# G compilation, check LG composition + utils/format_lm.sh data/lang data/local/lm/lm.gz \ + data/local/dict/lexicon.txt data/lang_test +fi -# Now make MFCC features. -# mfccdir should be some place with a largish disk where you -# want to store MFCC features. 
mfccdir=mfcc - -for x in train test ; do - steps/make_mfcc.sh --cmd "$train_cmd" --nj $num_jobs \ - data/$x exp/make_mfcc/$x $mfccdir - utils/fix_data_dir.sh data/$x # some files fail to get mfcc for many reasons - steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir -done - - -# Here we start the AM - -# Let's create a subset with 10k segments to make quick flat-start training: -utils/subset_data_dir.sh data/train 10000 data/train.10K || exit 1; - -# Train monophone models on a subset of the data, 10K segment -# Note: the --boost-silence option should probably be omitted by default -steps/train_mono.sh --nj 40 --cmd "$train_cmd" \ - data/train.10K data/lang exp/mono || exit 1; - - -# Get alignments from monophone system. -steps/align_si.sh --nj $num_jobs --cmd "$train_cmd" \ - data/train data/lang exp/mono exp/mono_ali || exit 1; - -# train tri1 [first triphone pass] -steps/train_deltas.sh --cmd "$train_cmd" \ - 2500 30000 data/train data/lang exp/mono_ali exp/tri1 || exit 1; - -# First triphone decoding -utils/mkgraph.sh data/lang_test exp/tri1 exp/tri1/graph -steps/decode.sh --nj $num_decode_jobs --cmd "$decode_cmd" \ - exp/tri1/graph data/test exp/tri1/decode - -steps/align_si.sh --nj $num_jobs --cmd "$train_cmd" \ - data/train data/lang exp/tri1 exp/tri1_ali || exit 1; - -# Train tri2a, which is deltas+delta+deltas -steps/train_deltas.sh --cmd "$train_cmd" \ - 3000 40000 data/train data/lang exp/tri1_ali exp/tri2a || exit 1; - -# tri2a decoding -utils/mkgraph.sh data/lang_test exp/tri2a exp/tri2a/graph -steps/decode.sh --nj $num_decode_jobs --cmd "$decode_cmd" \ - exp/tri2a/graph data/test exp/tri2a/decode - -# train and decode tri2b [LDA+MLLT] -steps/train_lda_mllt.sh --cmd "$train_cmd" 4000 50000 \ - data/train data/lang exp/tri1_ali exp/tri2b || exit 1; - -utils/mkgraph.sh data/lang_test exp/tri2b exp/tri2b/graph -steps/decode.sh --nj $num_decode_jobs --cmd "$decode_cmd" \ - exp/tri2b/graph data/test exp/tri2b/decode - -# Align all data with 
LDA+MLLT system (tri2b) -steps/align_si.sh --nj $num_jobs --cmd "$train_cmd" \ - --use-graphs true data/train data/lang exp/tri2b exp/tri2b_ali || exit 1; - - -# From 2b system, train 3b which is LDA + MLLT + SAT. -steps/train_sat.sh --cmd "$train_cmd" \ - 5000 100000 data/train data/lang exp/tri2b_ali exp/tri3b || exit 1; - -utils/mkgraph.sh data/lang_test exp/tri3b exp/tri3b/graph -steps/decode_fmllr.sh --nj $num_decode_jobs --cmd \ - "$decode_cmd" exp/tri3b/graph data/test exp/tri3b/decode - -# From 3b system, align all data. -steps/align_fmllr.sh --nj $num_jobs --cmd "$train_cmd" \ - data/train data/lang exp/tri3b exp/tri3b_ali || exit 1; - - -# nnet3 cross-entropy -local/nnet3/run_tdnn.sh #tdnn recipe: -local/nnet3/run_lstm.sh --stage 12 #lstm recipe (we skip ivector training) - -# chain lattice-free -local/chain/run_tdnn.sh #tdnn recipe: -local/chain/run_tdnn_lstm.sh #tdnn-lstm recipe: - -time=$(date +"%Y-%m-%d-%H-%M-%S") - -#get detailed WER; reports, conversational and combined -local/split_wer.sh $galeData > RESULTS.details.$USER.$time # to make sure you keep the results timed and owned - -echo training succedded +if [ $stage -le 1 ]; then + echo "$0: Preparing the test and train feature files..." 
+ for x in train test ; do + steps/make_mfcc.sh --cmd "$train_cmd" --nj $num_jobs \ + data/$x exp/make_mfcc/$x $mfccdir + utils/fix_data_dir.sh data/$x # some files fail to get mfcc for many reasons + steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir + done +fi + +if [ $stage -le 2 ]; then + echo "$0: creating sub-set and training monophone system" + utils/subset_data_dir.sh data/train 10000 data/train.10K || exit 1; + + steps/train_mono.sh --nj 40 --cmd "$train_cmd" \ + data/train.10K data/lang exp/mono || exit 1; +fi + +if [ $stage -le 3 ]; then + echo "$0: Aligning data using monophone system" + steps/align_si.sh --nj $num_jobs --cmd "$train_cmd" \ + data/train data/lang exp/mono exp/mono_ali || exit 1; + + echo "$0: training triphone system with delta features" + steps/train_deltas.sh --cmd "$train_cmd" \ + 2500 30000 data/train data/lang exp/mono_ali exp/tri1 || exit 1; +fi + +if [ $stage -le 4 ] && $decode_gmm; then + utils/mkgraph.sh data/lang_test exp/tri1 exp/tri1/graph + steps/decode.sh --nj $num_decode_jobs --cmd "$decode_cmd" \ + exp/tri1/graph data/test exp/tri1/decode +fi + +if [ $stage -le 5 ]; then + echo "$0: Aligning data and retraining and realigning with lda_mllt" + steps/align_si.sh --nj $num_jobs --cmd "$train_cmd" \ + data/train data/lang exp/tri1 exp/tri1_ali || exit 1; + + steps/train_lda_mllt.sh --cmd "$train_cmd" 4000 50000 \ + data/train data/lang exp/tri1_ali exp/tri2b || exit 1; +fi + +if [ $stage -le 6 ] && $decode_gmm; then + utils/mkgraph.sh data/lang_test exp/tri2b exp/tri2b/graph + steps/decode.sh --nj $num_decode_jobs --cmd "$decode_cmd" \ + exp/tri2b/graph data/test exp/tri2b/decode +fi + +if [ $stage -le 7 ]; then + echo "$0: Aligning data and retraining and realigning with sat_basis" + steps/align_si.sh --nj $num_jobs --cmd "$train_cmd" \ + data/train data/lang exp/tri2b exp/tri2b_ali || exit 1; + + steps/train_sat_basis.sh --cmd "$train_cmd" \ + 5000 100000 data/train data/lang exp/tri2b_ali exp/tri3b || exit 1; + 
+ steps/align_fmllr.sh --nj $num_jobs --cmd "$train_cmd" \ + data/train data/lang exp/tri3b exp/tri3b_ali || exit 1; +fi + +if [ $stage -le 8 ] && $decode_gmm; then + utils/mkgraph.sh data/lang_test exp/tri3b exp/tri3b/graph + steps/decode_fmllr.sh --nj $num_decode_jobs --cmd \ + "$decode_cmd" exp/tri3b/graph data/test exp/tri3b/decode +fi + +if [ $stage -le 9 ]; then + echo "$0: Training a regular chain model using the e2e alignments..." + local/chain/run_tdnn.sh +fi + +echo "$0: training succeeded" exit 0 - -#TODO: -#LM (4-gram and RNN) rescoring -#combine lattices -#dialect detection - - - - 