From b93789fff00a18563275374e1beed4ac30272df8 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Fri, 21 Dec 2018 20:20:04 -0500 Subject: [PATCH 01/13] updating gale setup --- egs/gale_arabic/s5b/local/chain/copare_wer.sh | 72 ++++++ .../s5b/local/chain/run_chain_common.sh | 82 ++++++ .../s5b/local/chain/tuning/run_tdnn_1b.sh | 220 ++++++++++++++++ .../s5b/local/gale_data_prep_audio.sh | 32 --- .../s5b/local/gale_data_prep_split.sh | 39 --- .../s5b/local/gale_data_prep_txt.sh | 60 ----- egs/gale_arabic/s5b/local/gale_format_data.sh | 60 ----- egs/gale_arabic/s5b/local/gale_train_lms.sh | 81 ------ .../s5b/local/nnet3/run_ivector_common.sh | 136 +++------- egs/gale_arabic/s5b/local/prepare_data.sh | 105 ++++++++ ..._prep_grapheme_dict.sh => prepare_dict.sh} | 26 +- egs/gale_arabic/s5b/local/prepare_lm.sh | 46 ++++ egs/gale_arabic/s5b/local/score.sh | 60 +---- egs/gale_arabic/s5b/run.sh | 239 +++++++----------- 14 files changed, 664 insertions(+), 594 deletions(-) create mode 100755 egs/gale_arabic/s5b/local/chain/copare_wer.sh create mode 100755 egs/gale_arabic/s5b/local/chain/run_chain_common.sh create mode 100755 egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1b.sh delete mode 100755 egs/gale_arabic/s5b/local/gale_data_prep_audio.sh delete mode 100755 egs/gale_arabic/s5b/local/gale_data_prep_split.sh delete mode 100755 egs/gale_arabic/s5b/local/gale_data_prep_txt.sh delete mode 100755 egs/gale_arabic/s5b/local/gale_format_data.sh delete mode 100755 egs/gale_arabic/s5b/local/gale_train_lms.sh create mode 100755 egs/gale_arabic/s5b/local/prepare_data.sh rename egs/gale_arabic/s5b/local/{gale_prep_grapheme_dict.sh => prepare_dict.sh} (61%) create mode 100755 egs/gale_arabic/s5b/local/prepare_lm.sh diff --git a/egs/gale_arabic/s5b/local/chain/copare_wer.sh b/egs/gale_arabic/s5b/local/chain/copare_wer.sh new file mode 100755 index 00000000000..1a40523355a --- /dev/null +++ b/egs/gale_arabic/s5b/local/chain/copare_wer.sh @@ -0,0 +1,72 @@ +#!/bin/bash + +# this script is used for comparing decoding results between systems. +# e.g. local/chain/compare_wer.sh exp/chain/cnn{1a,1b} + +# ./local/chain/compare_wer.sh exp/chain/cnn1a +# System cnn1a +# WER 0.61 +# CER 0.15 +# Final train prob -0.0377 +# Final valid prob -0.0380 +# Final train prob (xent) -0.0830 +# Final valid prob (xent) -0.0838 + +if [ $# == 0 ]; then + echo "Usage: $0: [ ... ]" + echo "e.g.: $0 exp/chain/cnn{1a,1b}" + exit 1 +fi + +echo "# $0 $*" +used_epochs=false + +echo -n "# System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +echo -n "# WER " +for x in $*; do + wer=$(cat $x/decode_test/scoring_kaldi/best_wer | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "# CER " +for x in $*; do + cer=$(cat $x/decode_test/scoring_kaldi/best_cer | awk '{print $2}') + printf "% 10s" $cer +done +echo + +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. 
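+  # (A note on the flag above: used_epochs is hardcoded to false in this
+  # script; if you ever compare a system trained for a different number of
+  # epochs, set it to true so the objective values below are skipped.)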
+fi + +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final train prob (xent) " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob (xent) " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo diff --git a/egs/gale_arabic/s5b/local/chain/run_chain_common.sh b/egs/gale_arabic/s5b/local/chain/run_chain_common.sh new file mode 100755 index 00000000000..da37e148441 --- /dev/null +++ b/egs/gale_arabic/s5b/local/chain/run_chain_common.sh @@ -0,0 +1,82 @@ +#!/bin/bash + +# this script has common stages shared across librispeech chain recipes. +# It generates a new topology in a new lang directory, gets the alignments as +# lattices, and builds a tree for the new topology +set -e + +stage=11 + +# input directory names. These options are actually compulsory, and they have +# been named for convenience +gmm_dir= +ali_dir= +lores_train_data_dir= + +num_leaves=6000 + +# output directory names. They are also compulsory. +lang= +lat_dir= +tree_dir= +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +[ -z $lang ] && echo "Set --lang, this specifies the new lang directory which will have the new topology" && exit 1; +[ -z $lat_dir ] && echo "Set --lat-dir, this specifies the experiment directory to store lattice" && exit 1; +[ -z $tree_dir ] && echo "Set --tree-dir, this specifies the directory to store new tree " && exit 1; + +for f in $gmm_dir/final.mdl $ali_dir/ali.1.gz $lores_train_data_dir/feats.scp; do + [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1 +done + +if [ $stage -le 11 ]; then + echo "$0: creating lang directory with one state per phone." + # Create a version of the lang/ directory that has one state per phone in the + # topo file. [note, it really has two states.. the first one is only repeated + # once, the second one has zero or more repeats.] + if [ -d $lang ]; then + if [ $lang/L.fst -nt data/lang/L.fst ]; then + echo "$0: $lang already exists, not overwriting it; continuing" + else + echo "$0: $lang already exists and seems to be older than data/lang..." + echo " ... not sure what to do. Exiting." + exit 1; + fi + else + cp -r data/lang $lang + silphonelist=$(cat $lang/phones/silence.csl) || exit 1; + nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1; + # Use our special topology... note that later on may have to tune this + # topology. + steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo + fi +fi + +if [ $stage -le 12 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + nj=$(cat ${ali_dir}/num_jobs) || exit 1; + steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" ${lores_train_data_dir} \ + $lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 13 ]; then + # Build a tree using our new topology. 
We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" $num_leaves ${lores_train_data_dir} $lang $ali_dir $tree_dir +fi + +exit 0; diff --git a/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1b.sh b/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1b.sh new file mode 100755 index 00000000000..caa0d9d805e --- /dev/null +++ b/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1b.sh @@ -0,0 +1,220 @@ +#!/bin/bash + +# ./local/chain/compare_wer.sh exp/chain/tdnn_1b/ +# System tdnn_1b +# WER 17.23 +# CER 6.83 +# Final train prob -0.0825 +# Final valid prob -0.0987 +# Final train prob (xent) -0.6611 +# Final valid prob (xent) -0.7393 + +# head exp/chain/tdnn_1b/decode_test_rnnlm_1e_2_0.40/scoring_kaldi/best_wer +# WER 16.58 [ 11549 / 69668, 1290 ins, 2389 del, 7870 sub ] exp/chain/tdnn1c_swbd_sp/decode_test_rnnlm_1e_2_0.40/wer_10_0.5 + +# exp/chain/tdnn_1b/: num-iters=441 nj=3..16 num-params=16.5M dim=40+100->1792 combine=-0.081->-0.081 (over 6) xent:train/valid[293,440,final]=(-0.937,-0.659,-0.661/-0.960,-0.739,-0.739) logprob:train/valid[293,440,final]=(-0.124,-0.083,-0.083/-0.127,-0.100,-0.099) + +set -e -o pipefail +stage=0 +nj=30 +train_set=train +test_set=test +gmm=tri2b # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. +num_threads_ubm=32 +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. + +# Options which are not passed through to run_ivector_common.sh +affix=_1b #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +common_egs_dir= +reporting_email= + +# LSTM/chain options +train_stage=-10 +xent_regularize=0.1 +dropout_schedule='0,0@0.20,0.5@0.50,0' + +# training chunk-options +chunk_width=150,110,100 +get_egs_stage=-10 + +# training options +srand=0 +remove_egs=true +run_ivector_common=true +run_chain_common=true +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $affine_opts dim=1536 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=0 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf14 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf15 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + linear-component name=prefinal-l dim=256 $linear_opts + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/wsj-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs 6 \ + --trainer.frames-per-iter 1500000 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.00025 \ + --trainer.optimization.final-effective-lrate 0.000025 \ + --trainer.num-chunk-per-minibatch=64,32 \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0 --constrained false" \ + --egs.stage $get_egs_stage \ + --cleanup.remove-egs=$remove_egs \ + --feat-dir=$train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir=$lat_dir \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 17 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/lang/check_phones_compatible.sh \ + data/lang_test/phones.txt $lang/phones.txt + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang \ + $tree_dir $tree_dir/graph || exit 1; +fi + +if [ $stage -le 18 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + steps/nnet3/decode.sh \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 0 --extra-right-context 0 \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$decode_cmd" --num-threads 4 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${test_set}_hires \ + $tree_dir/graph data/${test_set}_hires ${dir}/decode_${test_set} || exit 1 +fi diff --git a/egs/gale_arabic/s5b/local/gale_data_prep_audio.sh b/egs/gale_arabic/s5b/local/gale_data_prep_audio.sh deleted file mode 100755 index 0125272d06c..00000000000 --- a/egs/gale_arabic/s5b/local/gale_data_prep_audio.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/bin/bash - -# Copyright 2014 QCRI (author: Ahmed Ali) -# Apache 2.0 - - -galeData=$(utils/make_absolute.sh "${@: -1}" ); # last argumnet; the local folder -audio_dvds=${@:1:${#}-1} # all the audio dvds for GALE corpus; ; check audio=( in ../run.sh - -mkdir -p $galeData - -# check that sox is installed -which sox &>/dev/null -if [[ $? != 0 ]]; then - echo "sox is not installed"; exit 1 -fi - -for dvd in $audio_dvds; do - dvd_full_path=$(utils/make_absolute.sh $dvd) - if [[ ! 
-e $dvd_full_path ]]; then - echo missing $dvd_full_path; exit 1; - fi - find $dvd_full_path \( -name "*.wav" -o -name "*.flac" \) | while read file; do - id=$(basename $file | awk '{gsub(".wav","");gsub(".flac","");print}') - echo "$id sox $file -r 16000 -t wav - |" - done -done | sort -u > $galeData/wav.scp - -echo data prep audio succeded - -exit 0 - diff --git a/egs/gale_arabic/s5b/local/gale_data_prep_split.sh b/egs/gale_arabic/s5b/local/gale_data_prep_split.sh deleted file mode 100755 index b18a4e5b105..00000000000 --- a/egs/gale_arabic/s5b/local/gale_data_prep_split.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/bash - -# Copyright 2014 QCRI (author: Ahmed Ali) -# Apache 2.0 - -if [ $# -ne 1 ]; then - echo "Arguments should be the "; exit 1 -fi - - -#data will data/local - -galeData=$(utils/make_absolute.sh $1) -mkdir -p data/local -dir=$(utils/make_absolute.sh data/local) - - -grep -f local/test_list $galeData/all | grep -v -f local/bad_segments > $galeData/all.test -grep -v -f local/test_list $galeData/all | grep -v -f local/bad_segments > $galeData/all.train - -for x in test train; do - outdir=$dir/$x - file=$galeData/all.$x - mkdir -p $outdir - awk '{print $2 " " $2}' $file | sort -u > $outdir/utt2spk - cp -pr $outdir/utt2spk $outdir/spk2utt - awk '{print $2 " " $1 " " $3 " " $4}' $file | sort -u > $outdir/segments - awk '{printf $2 " "; for (i=5; i<=NF; i++) {printf $i " "} printf "\n"}' $file | sort -u > $outdir/text -done - - -grep -f local/test_list $galeData/wav.scp > $dir/test/wav.scp - -cat $galeData/wav.scp | awk -v seg=$dir/train/segments 'BEGIN{while((getline0) {seen[$2]=1;}} - {if (seen[$1]) { print $0}}' > $dir/train/wav.scp - -echo data prep split succeeded - -exit 0 diff --git a/egs/gale_arabic/s5b/local/gale_data_prep_txt.sh b/egs/gale_arabic/s5b/local/gale_data_prep_txt.sh deleted file mode 100755 index 04529d88ac0..00000000000 --- a/egs/gale_arabic/s5b/local/gale_data_prep_txt.sh +++ /dev/null @@ -1,60 +0,0 @@ -#!/bin/bash - -# Copyright 2014 QCRI (author: Ahmed Ali) -# Apache 2.0 - -galeData=$(utils/make_absolute.sh "${@: -1}" ); # last argumnet; the local folder -txt_dvds=${@:1:${#}-1} # all the txt cds correspoding to the audio corpus; check text=( in ../run.sh - - -top_pwd=`pwd` -txtdir=$galeData/txt -mkdir -p $txtdir; cd $txtdir - -for cdx in $txt_dvds; do - echo "Preparing $cdx" - if [[ $cdx == *.tgz ]] ; then - tar -xvf $cdx - elif [ -d "$cdx" ]; then - ln -s $cdx `basename $cdx` - else - echo "I don't really know what I shall do with $cdx " >&2 - fi -done - -find -L . 
-type f -name "*.tdf" | while read file; do -sed '1,3d' $file # delete the first 3 lines -done > all.tmp$$ - -perl -e ' - ($inFile,$idFile,$txtFile)= split /\s+/, $ARGV[0]; - open(IN, "$inFile"); - open(ID, ">$idFile"); - open(TXT, ">$txtFile"); - while () { - @arr= split /\t/,$_; - $start=sprintf ("%0.3f",$arr[2]);$rStart=$start;$start=~s/\.//; $start=~s/^0+$/0/; $start=~s/^0+([^0])/$1/; # remove zeros at the beginning - $end=sprintf ("%0.3f",$arr[3]);$rEnd=$end;$end=~s/^0+([^0])/$1/;$end=~s/\.//; - if ( ($arr[11] !~ m/report/) && ($arr[11] !~ m/conversational/) ){$arr[11]="UNK";} - $id="$arr[11] $arr[0] $arr[0]_${start}_${end} $rStart $rEnd\n"; - next if ($rStart == $rEnd); - $id =~ s/.sph//g; - print ID $id; - print TXT "$arr[7]\n"; - }' "all.tmp$$ allid.tmp$$ contentall.tmp$$" - - -perl ${top_pwd}/local/normalize_transcript_BW.pl contentall.tmp$$ contentall.buck.tmp$$ - -paste allid.tmp$$ contentall.buck.tmp$$ | sed 's: $::' | awk '{if (NF>5) {print $0}}' > all_1.tmp$$ - -awk '{$1="";print $0}' all_1.tmp$$ | sed 's:^ ::' > $galeData/all -awk '{if ($1 == "report") {$1="";print $0}}' all_1.tmp$$ | sed 's:^ ::' > $galeData/report -awk '{if ($1 == "conversational") {$1="";print $0}}' all_1.tmp$$ | sed 's:^ ::' > $galeData/conversational - -cd ..; -rm -fr $txtdir -cd $top_pwd -echo data prep text succeeded - -exit 0 diff --git a/egs/gale_arabic/s5b/local/gale_format_data.sh b/egs/gale_arabic/s5b/local/gale_format_data.sh deleted file mode 100755 index b69c34e68b9..00000000000 --- a/egs/gale_arabic/s5b/local/gale_format_data.sh +++ /dev/null @@ -1,60 +0,0 @@ -#!/bin/bash - -# Copyright 2014 QCRI (author: Ahmed Ali) -# Apache 2.0 - -if [ -f path.sh ]; then - . ./path.sh; else - echo "$0: missing path.sh"; exit 1; -fi - -for dir in test train; do - cp -pr data/local/$dir data/$dir -done - - -mkdir -p data/lang_test - -arpa_lm=data/local/lm/3gram-mincount/lm_unpruned.gz -[ ! -f $arpa_lm ] && echo No such file $arpa_lm && exit 1; - -rm -r data/lang_test -cp -r data/lang data/lang_test - -gunzip -c "$arpa_lm" | \ - arpa2fst --disambig-symbol=#0 \ - --read-symbol-table=data/lang_test/words.txt - data/lang_test/G.fst - - -echo "$0: Checking how stochastic G is (the first of these numbers should be small):" -fstisstochastic data/lang_test/G.fst - -## Check lexicon. -## just have a look and make sure it seems sane. -echo "$0: First few lines of lexicon FST:" -fstprint --isymbols=data/lang/phones.txt --osymbols=data/lang/words.txt data/lang/L.fst | head - -echo "$0: Performing further checks" - -# Checking that G.fst is determinizable. -fstdeterminize data/lang_test/G.fst /dev/null || echo Error determinizing G. - -# Checking that L_disambig.fst is determinizable. -fstdeterminize data/lang_test/L_disambig.fst /dev/null || echo Error determinizing L. - -# Checking that disambiguated lexicon times G is determinizable -# Note: we do this with fstdeterminizestar not fstdeterminize, as -# fstdeterminize was taking forever (presumbaly relates to a bug -# in this version of OpenFst that makes determinization slow for -# some case). -fsttablecompose data/lang_test/L_disambig.fst data/lang_test/G.fst | \ - fstdeterminizestar >/dev/null || echo Error - -# Checking that LG is stochastic: -fsttablecompose data/lang/L_disambig.fst data/lang_test/G.fst | \ - fstisstochastic || echo LG is not stochastic - - -echo gale_format_data succeeded. 
- -exit 0 diff --git a/egs/gale_arabic/s5b/local/gale_train_lms.sh b/egs/gale_arabic/s5b/local/gale_train_lms.sh deleted file mode 100755 index 3988ec3818f..00000000000 --- a/egs/gale_arabic/s5b/local/gale_train_lms.sh +++ /dev/null @@ -1,81 +0,0 @@ -#!/bin/bash - - -# To be run from one directory above this script. - - -lexicon=data/local/dict/lexicon.txt -[ ! -f $lexicon ] && echo "$0: No such file $lexicon" && exit 1; - - -# This script takes no arguments. It assumes you have already run -# previus steps successfully -# It takes as input the files -#data/local/train.*/text -#data/local/dict/lexicon.txt - - -export LC_ALL=C # You'll get errors about things being not sorted, if you -# have a different locale. -export PATH=$PATH:./../../../tools/kaldi_lm -( # First make sure the kaldi_lm toolkit is installed. - cd $KALDI_ROOT/tools || exit 1; - if [ -d kaldi_lm ]; then - echo Not installing the kaldi_lm toolkit since it is already there. - else - echo Downloading and installing the kaldi_lm tools - if [ ! -f kaldi_lm.tar.gz ]; then - wget http://www.danielpovey.com/files/kaldi/kaldi_lm.tar.gz || exit 1; - fi - tar -xvzf kaldi_lm.tar.gz || exit 1; - cd kaldi_lm - make || exit 1; - echo Done making the kaldi_lm tools - fi -) || exit 1; - - -dir=data/local/lm - mkdir -p $dir - text=data/local/train/text - [ ! -f $text ] && echo "$0: No such file $text" && exit 1; - - cleantext=$dir/text.no_oov - - cat $text | awk -v lex=$lexicon 'BEGIN{while((getline0){ seen[$1]=1; } } - {for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf(" ",$n);} } printf("\n");}' \ - > $cleantext || exit 1; - - - cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \ - sort -nr > $dir/word.counts || exit 1; - - -# Get counts from acoustic training transcripts, and add one-count -# for each word in the lexicon (but not silence, we don't want it -# in the LM-- we'll add it optionally later). - cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \ - cat - <(grep -w -v '!SIL' $lexicon | awk '{print $1}') | \ - sort | uniq -c | sort -nr > $dir/unigram.counts || exit 1; - -# note: we probably won't really make use of as there aren't any OOVs - cat $dir/unigram.counts | awk '{print $2}' | get_word_map.pl "" "" "" > $dir/word_map \ - || exit 1; - -# note: ignore 1st field of train.txt, it's the utterance-id. - cat $cleantext | awk -v wmap=$dir/word_map 'BEGIN{while((getline0)map[$1]=$2;} - { for(n=2;n<=NF;n++) { printf map[$n]; if(n$dir/train.gz \ - || exit 1; - - train_lm.sh --arpa --lmtype 3gram-mincount $dir || exit 1; - -# LM is small enough that we don't need to prune it (only about 0.7M N-grams). -# Perplexity over 128254.000000 words is 90.446690 - -# note: output is -# data/local/lm/3gram-mincount/lm_unpruned.gz - - -echo train lm succeeded - -exit 0 diff --git a/egs/gale_arabic/s5b/local/nnet3/run_ivector_common.sh b/egs/gale_arabic/s5b/local/nnet3/run_ivector_common.sh index f14c8441869..5dc0818393b 100755 --- a/egs/gale_arabic/s5b/local/nnet3/run_ivector_common.sh +++ b/egs/gale_arabic/s5b/local/nnet3/run_ivector_common.sh @@ -2,31 +2,29 @@ set -e -o pipefail -# This script is called from local/nnet3/run_tdnn.sh and local/chain/run_tdnn.sh (and may eventually -# be called by more scripts). It contains the common feature preparation and iVector-related parts -# of the script. See those scripts for examples of usage. +# This script is called from scripts like local/nnet3/run_tdnn.sh and +# local/chain/run_tdnn.sh (and may eventually be called by more scripts). 
It +# contains the common feature preparation and iVector-related parts of the +# script. See those scripts for examples of usage. stage=0 nj=100 -min_seg_len=1.55 # min length in seconds... we do this because chain training - # will discard segments shorter than 1.5 seconds. Must remain in sync - # with the same option given to prepare_lores_feats_and_alignments.sh train_set=train # you might set this to e.g. train. -gmm=tri2b # This specifies a GMM-dir from the features of the type you're training the system on; +test_sets="test" +gmm=tri2b # This specifies a GMM-dir from the features of the type you're training the system on; # it should contain alignments for 'train_set'. num_threads_ubm=32 -nnet3_affix=_cleaned # affix for exp/nnet3 directory to put iVector stuff in, so it - # becomes exp/nnet3_cleaned or whatever. +nnet3_affix= # affix for exp/nnet3 directory to put iVector stuff . ./cmd.sh . ./path.sh -. ./utils/parse_options.sh +. utils/parse_options.sh gmm_dir=exp/${gmm} -ali_dir=exp/${gmm}_ali_${train_set}_sp_comb +ali_dir=exp/${gmm}_ali_${train_set}_sp for f in data/${train_set}/feats.scp ${gmm_dir}/final.mdl; do if [ ! -f $f ]; then @@ -61,7 +59,7 @@ if [ $stage -le 2 ]; then utils/create_split_dir.pl /export/b0{5,6,7,8}/$USER/kaldi-data/mfcc/gale_arabic-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage fi - for datadir in ${train_set}_sp test; do + for datadir in ${train_set}_sp ${test_sets}; do utils/copy_data_dir.sh data/$datadir data/${datadir}_hires done @@ -69,7 +67,7 @@ if [ $stage -le 2 ]; then # features; this helps make trained nnets more invariant to test data volume. utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires - for datadir in ${train_set}_sp test; do + for datadir in ${train_set}_sp ${test_sets}; do steps/make_mfcc.sh --nj $nj --mfcc-config conf/mfcc_hires.conf \ --cmd "$train_cmd" data/${datadir}_hires steps/compute_cmvn_stats.sh data/${datadir}_hires @@ -78,75 +76,33 @@ if [ $stage -le 2 ]; then fi if [ $stage -le 3 ]; then - echo "$0: combining short segments of speed-perturbed high-resolution MFCC training data" - # we have to combine short segments or we won't be able to train chain models - # on those segments. - utils/data/combine_short_segments.sh \ - data/${train_set}_sp_hires $min_seg_len data/${train_set}_sp_hires_comb - - # just copy over the CMVN to avoid having to recompute it. - cp data/${train_set}_sp_hires/cmvn.scp data/${train_set}_sp_hires_comb/ - utils/fix_data_dir.sh data/${train_set}_sp_hires_comb/ -fi - -if [ $stage -le 4 ]; then - echo "$0: selecting segments of hires training data that were also present in the" - echo " ... original training data." - - # note, these data-dirs are temporary; we put them in a sub-directory - # of the place where we'll make the alignments. - temp_data_root=exp/nnet3${nnet3_affix}/tri5 - mkdir -p $temp_data_root - - utils/data/subset_data_dir.sh --utt-list data/${train_set}/feats.scp \ - data/${train_set}_sp_hires $temp_data_root/${train_set}_hires - - # note: essentially all the original segments should be in the hires data. - n1=$(wc -l /dev/null +if [[ $? != 0 ]]; then + echo "sox is not installed"; exit 1 +fi + +for dvd in $dir1 $dir2 $dir3; do + dvd_full_path=$(utils/make_absolute.sh $dvd) + if [[ ! 
-e $dvd_full_path ]]; then
+    echo missing $dvd_full_path; exit 1;
+  fi
+  find $dvd_full_path \( -name "*.wav" -o -name "*.flac" \) | while read file; do
+    id=$(basename $file | awk '{gsub(".wav","");gsub(".flac","");print}')
+    echo "$id sox $file -r 16000 -t wav - |"
+  done
+done | sort -u > $gale_data/wav.scp
+echo data prep audio succeeded
+
+gale_data=$(utils/make_absolute.sh "GALE" );
+top_pwd=`pwd`
+txtdir=$gale_data/txt
+mkdir -p $txtdir; cd $txtdir
+
+for cdx in $text1 $text2 $text3; do
+  echo "Preparing $cdx"
+  if [[ $cdx == *.tgz ]] ; then
+    tar -xvf $cdx
+  elif [ -d "$cdx" ]; then
+    ln -s $cdx `basename $cdx`
+  else
+    echo "I don't really know what I shall do with $cdx " >&2
+  fi
+done
+
+find -L . -type f -name "*.tdf" | while read file; do
+sed '1,3d' $file # delete the first 3 lines
+done > all.tmp$$
+
+perl -e '
+  ($inFile,$idFile,$txtFile)= split /\s+/, $ARGV[0];
+  open(IN, "$inFile");
+  open(ID, ">$idFile");
+  open(TXT, ">$txtFile");
+  while (<IN>) {
+    @arr= split /\t/,$_;
+    $start=sprintf ("%0.3f",$arr[2]);$rStart=$start;$start=~s/\.//; $start=~s/^0+$/0/; $start=~s/^0+([^0])/$1/; # remove zeros at the beginning
+    $end=sprintf ("%0.3f",$arr[3]);$rEnd=$end;$end=~s/^0+([^0])/$1/;$end=~s/\.//;
+    if ( ($arr[11] !~ m/report/) && ($arr[11] !~ m/conversational/) ){$arr[11]="UNK";}
+    $id="$arr[11] $arr[0] $arr[0]_${start}_${end} $rStart $rEnd\n";
+    next if ($rStart == $rEnd);
+    $id =~ s/.sph//g;
+    print ID $id;
+    print TXT "$arr[7]\n";
+  }' "all.tmp$$ allid.tmp$$ contentall.tmp$$"
+
+perl ${top_pwd}/local/normalize_transcript_BW.pl contentall.tmp$$ contentall.buck.tmp$$
+paste allid.tmp$$ contentall.buck.tmp$$ | sed 's: $::' | awk '{if (NF>5) {print $0}}' > all_1.tmp$$
+
+
+awk '{$1="";print $0}' all_1.tmp$$ | sed 's:^ ::' > $gale_data/all
+awk '{if ($1 == "report") {$1="";print $0}}' all_1.tmp$$ | sed 's:^ ::' > $gale_data/report
+awk '{if ($1 == "conversational") {$1="";print $0}}' all_1.tmp$$ | sed 's:^ ::' > $gale_data/conversational
+
+cd ..;
+rm -fr $txtdir
+cd $top_pwd
+echo data prep text succeeded
+
+mkdir -p data
+dir=$(utils/make_absolute.sh data/)
+grep -f local/test_list $gale_data/all | grep -v -f local/bad_segments > $gale_data/all.test
+grep -v -f local/test_list $gale_data/all | grep -v -f local/bad_segments > $gale_data/all.train
+
+for x in test train; do
+  outdir=data/$x
+  file=$gale_data/all.$x
+  mkdir -p $outdir
+  awk '{print $2 " " $2}' $file | sort -u > $outdir/utt2spk
+  cp -pr $outdir/utt2spk $outdir/spk2utt
+  awk '{print $2 " " $1 " " $3 " " $4}' $file | sort -u > $outdir/segments
+  awk '{printf $2 " "; for (i=5; i<=NF; i++) {printf $i " "} printf "\n"}' $file | sort -u > $outdir/text
+done
+
+grep -f local/test_list $gale_data/wav.scp > $dir/test/wav.scp
+
+cat $gale_data/wav.scp | awk -v seg=$dir/train/segments 'BEGIN{while((getline < seg) > 0) {seen[$2]=1;}}
+  {if (seen[$1]) { print $0}}' > $dir/train/wav.scp
+
+echo data prep split succeeded
+
+exit 0
diff --git a/egs/gale_arabic/s5b/local/gale_prep_grapheme_dict.sh b/egs/gale_arabic/s5b/local/prepare_dict.sh
similarity index 61%
rename from egs/gale_arabic/s5b/local/gale_prep_grapheme_dict.sh
rename to egs/gale_arabic/s5b/local/prepare_dict.sh
index 5f101f8245b..abaf8177f77 100755
--- a/egs/gale_arabic/s5b/local/gale_prep_grapheme_dict.sh
+++ b/egs/gale_arabic/s5b/local/prepare_dict.sh
@@ -3,39 +3,31 @@
 # Copyright 2017 QCRI (author: Ahmed Ali)
 # Apache 2.0
-
-# run this from ../
+mkdir -p data/local/dict
 dir=$(utils/make_absolute.sh data/local/dict)
-mkdir -p $dir
-
-# (1) Get all available dictionaries; since this is a grapheme model, we mainly need the most frequent word lists
 wget http://alt.qcri.org//resources/speech/dictionary/ar-ar_grapheme_lexicon_2016-02-09.bz2 || exit 1;
 wget http://alt.qcri.org//resources/speech/dictionary/ar-ar_lexicon_2014-03-17.txt.bz2 || exit 1;
 bzcat ar-ar_grapheme_lexicon_2016-02-09.bz2 | sed '1,3d' | awk '{print $1}' > tmp$$
 bzcat ar-ar_lexicon_2014-03-17.txt.bz2 | sed '1,3d' | awk '{print $1}' >> tmp$$
-# (2) Now we add all the words appeared in the training data
-cat data/local/train/text | cut -d ' ' -f 2- | tr -s " " "\n" | sort -u >> tmp$$
+
+cat data/train/text | cut -d ' ' -f 2- | tr -s " " "\n" | sort -u >> tmp$$
 grep -v [0-9] tmp$$ | sed -e 's:[FNKaui\~o\`]::g' -e 's:{:}:g' | sort -u > tmp1.$$ # remove vowels and rare alef wasla
 cat tmp1.$$ | sed 's:\(\):\1 :g' | sed -e 's: : :g' -e 's: : :g' -e 's:\s*: :g' -e 's:\*:V:g' > tmp2.$$
 paste -d ' ' tmp1.$$ tmp2.$$ > $dir/lexicon.txt
-#(2) Dictionary preparation:
+sed -i '1i <UNK> SIL' $dir/lexicon.txt
-
-# silence phones, one per line.
 echo SIL > $dir/silence_phones.txt
-echo SIL > $dir/optional_silence.txt
-# nonsilence phones; on each line is a list of phones that correspond
-# really to the same base phone.
-cat tmp2.$$ | tr -s ' ' '\n' | grep -v ^$ | sort -u > $dir/nonsilence_phones.txt || exit 1;
+echo SIL >$dir/optional_silence.txt
-sed -i '1i <UNK> SIL' $dir/lexicon.txt # insert word with phone sil at the beginning of the dictionary
+echo -n "" >$dir/extra_questions.txt
+
+cat tmp2.$$ | tr -s ' ' '\n' | grep -v ^$ | sort -u > $dir/nonsilence_phones.txt || exit 1;
 rm -fr ar-ar_lexicon_2014-03-17.txt.bz2 ar-ar_grapheme_lexicon_2016-02-09.bz2 tmp$$ tmp1.$$ tmp2.$$
-echo Dictionary preparation succeeded
-# The script is still missing dates and numbers
+echo Dictionary preparation succeeded
 exit 0
-
diff --git a/egs/gale_arabic/s5b/local/prepare_lm.sh b/egs/gale_arabic/s5b/local/prepare_lm.sh
new file mode 100755
index 00000000000..571ae1200df
--- /dev/null
+++ b/egs/gale_arabic/s5b/local/prepare_lm.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+
+# Copyright 2012 Vassil Panayotov
+#           2017 Ewald Enzinger
+# Apache 2.0
+
+. ./path.sh || exit 1
+
+echo "=== Building a language model ..."
+
+locdata=data/local/lm/
+mkdir -p $locdata
+
+# Language model order
+order=3
+
+. utils/parse_options.sh
+
+# Prepare a LM training corpus from the transcripts
+mkdir -p $locdata
+
+loc=`which ngram-count`;
+if [ -z $loc ]; then
+  if uname -a | grep 64 >/dev/null; then # some kind of 64 bit...
+    sdir=$KALDI_ROOT/tools/srilm/bin/i686-m64
+  else
+    sdir=$KALDI_ROOT/tools/srilm/bin/i686
+  fi
+  if [ -f $sdir/ngram-count ]; then
+    echo Using SRILM tools from $sdir
+    export PATH=$PATH:$sdir
+  else
+    echo You appear to not have SRILM tools installed, either on your path,
+    echo or installed in $sdir. See tools/install_srilm.sh for installation
+    echo instructions.
+    exit 1
+  fi
+fi
+
+cat data/train/text | cut -d " " -f 2- > $locdata/train.txt
+
+ngram-count -text $locdata/train.txt -order $order -interpolate \
+  -kndiscount -lm $locdata/lm.gz
+
+#ngram -lm $locdata/lm.gz -ppl $locdata/dev.txt
+echo "*** Finished building the LM model!"
diff --git a/egs/gale_arabic/s5b/local/score.sh b/egs/gale_arabic/s5b/local/score.sh
index 83366f7c7fc..1d84815fc69 100755
--- a/egs/gale_arabic/s5b/local/score.sh
+++ b/egs/gale_arabic/s5b/local/score.sh
@@ -1,60 +1,6 @@
-#!/bin/bash
-# Copyright 2012 Johns Hopkins University (Author: Daniel Povey)
-# Apache 2.0
-
-[ -f ./path.sh ] && . ./path.sh
-
-# begin configuration section.
-cmd=run.pl -stage=0 -decode_mbr=true -word_ins_penalty=0.0 -min_lmwt=7 -max_lmwt=17 -iter= #some of the scripts from steps/ seem to use it -#end configuration section. - -echo "$0 $#" - -[ -f ./path.sh ] && . ./path.sh -. parse_options.sh || exit 1; - -if [ $# -ne 3 ]; then - echo "Usage: local/score.sh [--cmd (run.pl|queue.pl...)] " - echo " Options:" - echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." - echo " --stage (0|1|2) # start scoring script from part-way through." - echo " --decode_mbr (true/false) # maximum bayes risk decoding (confusion network)." - echo " --min_lmwt # minumum LM-weight for lattice rescoring " - echo " --max_lmwt # maximum LM-weight for lattice rescoring " - exit 1; -fi -data=$1 -lang_or_graph=$2 -dir=$3 - -symtab=$lang_or_graph/words.txt - -for f in $symtab $dir/lat.1.gz $data/text; do - [ ! -f $f ] && echo "score.sh: no such file $f" && exit 1; -done - -mkdir -p $dir/scoring/log - -cat $data/text | sed 's:::g' | sed 's:::g' > $dir/scoring/test_filt.txt - -$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/best_path.LMWT.log \ - lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ - lattice-add-penalty --word-ins-penalty=$word_ins_penalty ark:- ark:- \| \ - lattice-best-path --word-symbol-table=$symtab \ - ark:- ark,t:$dir/scoring/LMWT.tra || exit 1; +#!/bin/bash -# Note: the double level of quoting for the sed command -$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.log \ - cat $dir/scoring/LMWT.tra \| \ - utils/int2sym.pl -f 2- $symtab \| sed 's:\::g' \| \ - compute-wer --text --mode=present \ - ark:$dir/scoring/test_filt.txt ark,p:- ">&" $dir/wer_LMWT || exit 1; -exit 0; +steps/scoring/score_kaldi_wer.sh "$@" +steps/scoring/score_kaldi_cer.sh --stage 2 "$@" diff --git a/egs/gale_arabic/s5b/run.sh b/egs/gale_arabic/s5b/run.sh index c45f5119949..bbb6349fea8 100755 --- a/egs/gale_arabic/s5b/run.sh +++ b/egs/gale_arabic/s5b/run.sh @@ -3,177 +3,110 @@ # Copyright 2014 QCRI (author: Ahmed Ali) # Apache 2.0 -. ./path.sh -. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. - ## This relates to the queue. num_jobs=120 num_decode_jobs=40 +decode_gmm=false +stage=0 +overwrite=false -#NB: You can add whatever number of copora you like. The supported extensions -#NB: (formats) are wav and flac. Flac will be converted using sox and in contrast -#NB: with the old approach, the conversion will be on-the-fly and one-time-only -#NB: during the parametrization. - -#NB: Text corpora scpecification. We support either tgz files, which are unpacked -#NB: or just plain (already unpacked) directories. The list of transcript is then -#NB: obtained using find command - -#Make sure you edit this section to reflect whers you keep the LDC data on your cluster - -#This is CLSP configuration. We add the 2014 GALE data. We got around 2 % -#improvement just by including it. The gain might be large if someone would tweak -# the number of leaves and states and so on. 
-
-#audio=(
-# /export/corpora/LDC/LDC2013S02/
-# /export/corpora/LDC/LDC2013S07/
-# /export/corpora/LDC/LDC2014S07/
-#)
-#text=(
-# /export/corpora/LDC/LDC2013T17
-# /export/corpora/LDC/LDC2013T04
-# /export/corpora/LDC/LDC2014T17
-#)
-
-audio=(
-  /data/sls/scratch/amali/data/GALE/LDC2013S02
-  /data/sls/scratch/amali/data/GALE/LDC2013S07
-  /data/sls/scratch/amali/data/GALE/LDC2014S07
-)
-text=(
-  /data/sls/scratch/amali/data/GALE/LDC2013T17.tgz
-  /data/sls/scratch/amali/data/GALE/LDC2013T04.tgz
-  /data/sls/scratch/amali/data/GALE/LDC2014T17.tgz
-)
+dir1=/export/corpora/LDC/LDC2013S02/
+dir2=/export/corpora/LDC/LDC2013S07/
+dir3=/export/corpora/LDC/LDC2014S07/
+text1=/export/corpora/LDC/LDC2013T17/
+text2=/export/corpora/LDC/LDC2013T04/
+text3=/export/corpora/LDC/LDC2014T17/
 galeData=GALE
-#prepare the data
-#split train dev test
-#prepare lexicon and LM
-
-# You can run the script from here automatically, but it is recommended to run the data preparation,
-# and features extraction manually and and only once.
-# By copying and pasting into your shell.
-
-#copy the audio files to local folder wav and convet flac files to wav
-local/gale_data_prep_audio.sh "${audio[@]}" $galeData || exit 1;
-
-#get the transcription and remove empty prompts and all noise markers
-local/gale_data_prep_txt.sh "${text[@]}" $galeData || exit 1;
+. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system.
+           ## This relates to the queue.
+. ./path.sh
+. ./utils/parse_options.sh # e.g. this parses the above options
+                           # if supplied.
-# split the data to reports and conversational and for each class will have rain/dev and test
-local/gale_data_prep_split.sh $galeData || exit 1;
+if [ $stage -le 0 ]; then
-# get all Arabic grapheme dictionaries and add silence and UNK
-local/gale_prep_grapheme_dict.sh || exit 1;
+  if [ -f data/train/text ] && ! $overwrite; then
+    echo "$0: Not processing; the script has probably been re-run from the wrong stage"
+    echo "Exiting with status 1 to avoid data corruption"
+    exit 1;
+  fi
+  echo "$0: Preparing data..."
+  local/prepare_data.sh --dir1 $dir1 --dir2 $dir2 --dir3 $dir3 \
+                        --text1 $text1 --text2 $text2 --text3 $text3
+
+  local/prepare_dict.sh
 #prepare the langauge resources
-utils/prepare_lang.sh data/local/dict "<UNK>" data/local/lang data/lang || exit 1;
+  utils/prepare_lang.sh data/local/dict "<UNK>" data/local/lang data/lang
 # LM training
-local/gale_train_lms.sh || exit 1;
+  local/prepare_lm.sh
-local/gale_format_data.sh || exit 1;
-# G compilation, check LG composition
+  utils/format_lm.sh data/lang data/local/lm/lm.gz \
+                     data/local/dict/lexicon.txt data/lang
+fi
 # Now make MFCC features.
 # mfccdir should be some place with a largish disk where you
 # want to store MFCC features.
 mfccdir=mfcc
-
-for x in train test ; do
-  steps/make_mfcc.sh --cmd "$train_cmd" --nj $num_jobs \
-    data/$x exp/make_mfcc/$x $mfccdir
-  utils/fix_data_dir.sh data/$x # some files fail to get mfcc for many reasons
-  steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir
-done
-
+if [ $stage -le 1 ]; then
+  echo "$0: Preparing the test and train feature files..."
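+  # Note: these are the low-resolution MFCCs used by the GMM stages below;
+  # the 40-dim hires features for the chain model are extracted later by
+  # local/nnet3/run_ivector_common.sh (using conf/mfcc_hires.conf).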
+ for x in train test ; do + steps/make_mfcc.sh --cmd "$train_cmd" --nj $num_jobs \ + data/$x exp/make_mfcc/$x $mfccdir + utils/fix_data_dir.sh data/$x # some files fail to get mfcc for many reasons + steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir + done +fi # Here we start the AM - -# Let's create a subset with 10k segments to make quick flat-start training: -utils/subset_data_dir.sh data/train 10000 data/train.10K || exit 1; - -# Train monophone models on a subset of the data, 10K segment -# Note: the --boost-silence option should probably be omitted by default -steps/train_mono.sh --nj 40 --cmd "$train_cmd" \ - data/train.10K data/lang exp/mono || exit 1; - - -# Get alignments from monophone system. -steps/align_si.sh --nj $num_jobs --cmd "$train_cmd" \ - data/train data/lang exp/mono exp/mono_ali || exit 1; - -# train tri1 [first triphone pass] -steps/train_deltas.sh --cmd "$train_cmd" \ - 2500 30000 data/train data/lang exp/mono_ali exp/tri1 || exit 1; - -# First triphone decoding -utils/mkgraph.sh data/lang_test exp/tri1 exp/tri1/graph -steps/decode.sh --nj $num_decode_jobs --cmd "$decode_cmd" \ - exp/tri1/graph data/test exp/tri1/decode - -steps/align_si.sh --nj $num_jobs --cmd "$train_cmd" \ - data/train data/lang exp/tri1 exp/tri1_ali || exit 1; - -# Train tri2a, which is deltas+delta+deltas -steps/train_deltas.sh --cmd "$train_cmd" \ - 3000 40000 data/train data/lang exp/tri1_ali exp/tri2a || exit 1; - -# tri2a decoding -utils/mkgraph.sh data/lang_test exp/tri2a exp/tri2a/graph -steps/decode.sh --nj $num_decode_jobs --cmd "$decode_cmd" \ - exp/tri2a/graph data/test exp/tri2a/decode - -# train and decode tri2b [LDA+MLLT] -steps/train_lda_mllt.sh --cmd "$train_cmd" 4000 50000 \ - data/train data/lang exp/tri1_ali exp/tri2b || exit 1; - -utils/mkgraph.sh data/lang_test exp/tri2b exp/tri2b/graph -steps/decode.sh --nj $num_decode_jobs --cmd "$decode_cmd" \ - exp/tri2b/graph data/test exp/tri2b/decode - -# Align all data with LDA+MLLT system (tri2b) -steps/align_si.sh --nj $num_jobs --cmd "$train_cmd" \ - --use-graphs true data/train data/lang exp/tri2b exp/tri2b_ali || exit 1; - - -# From 2b system, train 3b which is LDA + MLLT + SAT. -steps/train_sat.sh --cmd "$train_cmd" \ - 5000 100000 data/train data/lang exp/tri2b_ali exp/tri3b || exit 1; - -utils/mkgraph.sh data/lang_test exp/tri3b exp/tri3b/graph -steps/decode_fmllr.sh --nj $num_decode_jobs --cmd \ - "$decode_cmd" exp/tri3b/graph data/test exp/tri3b/decode - -# From 3b system, align all data. -steps/align_fmllr.sh --nj $num_jobs --cmd "$train_cmd" \ - data/train data/lang exp/tri3b exp/tri3b_ali || exit 1; +if [ $stage -le 2 ]; then + # Let's create a subset with 10k segments to make quick flat-start training: + utils/subset_data_dir.sh data/train 10000 data/train.10K || exit 1; + + # Train monophone models on a subset of the data, 10K segment + # Note: the --boost-silence option should probably be omitted by default + steps/train_mono.sh --nj 40 --cmd "$train_cmd" \ + data/train.10K data/lang exp/mono || exit 1; +fi + +if [ $stage -le 3 ]; then + # Get alignments from monophone system. 
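+  # (These monophone alignments seed tri1 below; in steps/train_deltas.sh the
+  # two numeric arguments are the number of tree leaves (2500) and the total
+  # number of Gaussians (30000).)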
+ steps/align_si.sh --nj $num_jobs --cmd "$train_cmd" \ + data/train data/lang exp/mono exp/mono_ali || exit 1; - -# nnet3 cross-entropy -local/nnet3/run_tdnn.sh #tdnn recipe: -local/nnet3/run_lstm.sh --stage 12 #lstm recipe (we skip ivector training) - -# chain lattice-free -local/chain/run_tdnn.sh #tdnn recipe: -local/chain/run_tdnn_lstm.sh #tdnn-lstm recipe: - -time=$(date +"%Y-%m-%d-%H-%M-%S") - -#get detailed WER; reports, conversational and combined -local/split_wer.sh $galeData > RESULTS.details.$USER.$time # to make sure you keep the results timed and owned - + # train tri1 [first triphone pass] + steps/train_deltas.sh --cmd "$train_cmd" \ + 2500 30000 data/train data/lang exp/mono_ali exp/tri1 || exit 1; +fi + +if [ $stage -le 4 ] && $decode_gmm; then + # First triphone decoding + utils/mkgraph.sh data/lang_test exp/tri1 exp/tri1/graph + steps/decode.sh --nj $num_decode_jobs --cmd "$decode_cmd" \ + exp/tri1/graph data/test exp/tri1/decode +fi + +if [ $stage -le 5 ]; then + steps/align_si.sh --nj $num_jobs --cmd "$train_cmd" \ + data/train data/lang exp/tri1 exp/tri1_ali || exit 1; + + # train and decode tri2b [LDA+MLLT] + steps/train_lda_mllt.sh --cmd "$train_cmd" 4000 50000 \ + data/train data/lang exp/tri1_ali exp/tri2b || exit 1; +fi + +if [ $stage -le 6 ] && $decode_gmm; then + utils/mkgraph.sh data/lang_test exp/tri2b exp/tri2b/graph + steps/decode.sh --nj $num_decode_jobs --cmd "$decode_cmd" \ + exp/tri2b/graph data/test exp/tri2b/decode +fi + +if [ $stage -le 7 ]; then + steps/align_si.sh --nj $num_jobs --cmd "$train_cmd" \ + --use-graphs true data/train data/lang exp/tri2b exp/tri2b_ali || exit 1; + + local/chain/run_tdnn.sh #tdnn recipe: +fi echo training succedded exit 0 - -#TODO: -#LM (4-gram and RNN) rescoring -#combine lattices -#dialect detection - - - - - From 7dcc6090ac383add2b3bc4bca667eee67fea3f73 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Fri, 21 Dec 2018 20:30:48 -0500 Subject: [PATCH 02/13] minor update --- egs/gale_arabic/s5b/local/chain/run_tdnn.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/gale_arabic/s5b/local/chain/run_tdnn.sh b/egs/gale_arabic/s5b/local/chain/run_tdnn.sh index 34499362831..61f8f499182 120000 --- a/egs/gale_arabic/s5b/local/chain/run_tdnn.sh +++ b/egs/gale_arabic/s5b/local/chain/run_tdnn.sh @@ -1 +1 @@ -tuning/run_tdnn_1a.sh \ No newline at end of file +tuning/run_tdnn_1b.sh \ No newline at end of file From d75efb25f2ca4fe26dbe2644a0584e7876b3894a Mon Sep 17 00:00:00 2001 From: aarora8 Date: Wed, 26 Dec 2018 14:55:00 -0500 Subject: [PATCH 03/13] fixing script name --- egs/gale_arabic/s5b/local/chain/{copare_wer.sh => compare_wer.sh} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename egs/gale_arabic/s5b/local/chain/{copare_wer.sh => compare_wer.sh} (100%) diff --git a/egs/gale_arabic/s5b/local/chain/copare_wer.sh b/egs/gale_arabic/s5b/local/chain/compare_wer.sh similarity index 100% rename from egs/gale_arabic/s5b/local/chain/copare_wer.sh rename to egs/gale_arabic/s5b/local/chain/compare_wer.sh From bee4ba275544a12cc72d4690825d202ae2f2afb4 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Sun, 30 Dec 2018 20:49:50 -0500 Subject: [PATCH 04/13] fixing bug --- egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1b.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1b.sh b/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1b.sh index caa0d9d805e..9e76130e7bd 100755 --- a/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1b.sh +++ 
b/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1b.sh @@ -100,7 +100,7 @@ if $run_chain_common; then --lores-train-data-dir ${lores_train_data_dir} \ --lang $lang \ --lat-dir $lat_dir \ - --num-leaves 7000 \ + --num-leaves 3500 \ --tree-dir $tree_dir || exit 1; fi From 715d4def57ec68e69ea615d9711b965021ed9b45 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Sun, 30 Dec 2018 20:58:09 -0500 Subject: [PATCH 05/13] modification from review, todo: run tri3 --- egs/gale_arabic/s5b/local/chain/run_tdnn.sh | 2 +- .../s5b/local/chain/tuning/run_tdnn_1a.sh | 288 +++++++++--------- .../s5b/local/chain/tuning/run_tdnn_1b.sh | 220 ------------- egs/gale_arabic/s5b/run.sh | 27 +- 4 files changed, 164 insertions(+), 373 deletions(-) delete mode 100755 egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1b.sh diff --git a/egs/gale_arabic/s5b/local/chain/run_tdnn.sh b/egs/gale_arabic/s5b/local/chain/run_tdnn.sh index 61f8f499182..34499362831 120000 --- a/egs/gale_arabic/s5b/local/chain/run_tdnn.sh +++ b/egs/gale_arabic/s5b/local/chain/run_tdnn.sh @@ -1 +1 @@ -tuning/run_tdnn_1b.sh \ No newline at end of file +tuning/run_tdnn_1a.sh \ No newline at end of file diff --git a/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1a.sh b/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1a.sh index 7afafb31ff6..b5486decc31 100755 --- a/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1a.sh @@ -1,31 +1,52 @@ #!/bin/bash -#started from tedlium recipe with few edits +# ./local/chain/compare_wer.sh exp/chain/tdnn_1b/ +# System tdnn_1b +# WER 17.23 +# CER 6.83 +# Final train prob -0.0825 +# Final valid prob -0.0987 +# Final train prob (xent) -0.6611 +# Final valid prob (xent) -0.7393 +# head exp/chain/tdnn_1b/decode_test_rnnlm_1e_2_0.40/scoring_kaldi/best_wer +# WER 16.58 [ 11549 / 69668, 1290 ins, 2389 del, 7870 sub ] exp/chain/tdnn1c_swbd_sp/decode_test_rnnlm_1e_2_0.40/wer_10_0.5 -set -e -o pipefail +# exp/chain/tdnn_1b/: num-iters=441 nj=3..16 num-params=16.5M dim=40+100->1792 combine=-0.081->-0.081 (over 6) xent:train/valid[293,440,final]=(-0.937,-0.659,-0.661/-0.960,-0.739,-0.739) logprob:train/valid[293,440,final]=(-0.124,-0.083,-0.083/-0.127,-0.100,-0.099) -# First the options that are passed through to run_ivector_common.sh -# (some of which are also used in this script directly). +set -e -o pipefail stage=0 nj=30 -decode_nj=30 -min_seg_len=1.55 -xent_regularize=0.1 train_set=train -gmm=tri2b # the gmm for the target data +test_set=test +gmm=tri2b # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. num_threads_ubm=32 -nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned - -# The rest are configs specific to this script. Most of the parameters -# are just hardcoded at this level, in the commands below. -train_stage=-10 #default -10 -tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. -tdnn_affix=1b #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration. -common_egs_dir= # you can set this to use previously dumped egs. +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. + +# Options which are not passed through to run_ivector_common.sh +affix=_1a #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. 
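+# common_egs_dir can be set to an egs directory dumped by a previous run in
+# order to skip egs generation; it is passed to train.py via --egs.dir below.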
+common_egs_dir= +reporting_email= + +# LSTM/chain options +train_stage=-10 +xent_regularize=0.1 +dropout_schedule='0,0@0.20,0.5@0.50,0' + +# training chunk-options +chunk_width=150,110,100 +get_egs_stage=-10 + +# training options +srand=0 +remove_egs=true +run_ivector_common=true +run_chain_common=true # End configuration section. echo "$0 $@" # Print the command line for logging + . ./cmd.sh . ./path.sh . ./utils/parse_options.sh @@ -39,169 +60,162 @@ where "nvcc" is installed. EOF fi -local/nnet3/run_ivector_common.sh --stage $stage \ - --nj $nj \ - --min-seg-len $min_seg_len \ - --train-set $train_set \ - --gmm $gmm \ - --num-threads-ubm $num_threads_ubm \ - --nnet3-affix "$nnet3_affix" - - -gmm_dir=exp/$gmm -ali_dir=exp/${gmm}_ali_${train_set}_sp_comb -tree_dir=exp/chain${nnet3_affix}/tree_bi${tree_affix} -lat_dir=exp/chain${nnet3_affix}/${gmm}_${train_set}_sp_comb_lats -dir=exp/chain${nnet3_affix}/tdnn${tdnn_affix}_sp_bi -train_data_dir=data/${train_set}_sp_hires_comb -lores_train_data_dir=data/${train_set}_sp_comb -train_ivector_dir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_sp_hires_comb - +if $run_ivector_common; then + local/nnet3/run_ivector_common.sh \ + --stage $stage --nj $nj \ + --train-set $train_set --gmm $gmm \ + --num-threads-ubm $num_threads_ubm \ + --nnet3-affix "$nnet3_affix" +fi -for f in $gmm_dir/final.mdl $train_data_dir/feats.scp $train_ivector_dir/ivector_online.scp \ - $lores_train_data_dir/feats.scp $ali_dir/ali.1.gz $gmm_dir/final.mdl; do +gmm_dir=exp/${gmm} +ali_dir=exp/${gmm}_ali_${train_set}_sp +lat_dir=exp/chain${nnet3_affix}/${gmm}_${train_set}_sp_lats +dir=exp/chain${nnet3_affix}/tdnn${affix}_sp +train_data_dir=data/${train_set}_sp_hires +train_ivector_dir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_sp_hires +lores_train_data_dir=data/${train_set}_sp + +# note: you don't necessarily have to change the treedir name +# each time you do a new experiment-- only if you change the +# configuration in a way that affects the tree. +tree_dir=exp/chain${nnet3_affix}/tree_a_sp +# the 'lang' directory is created by this script. +# If you create such a directory with a non-standard topology +# you should probably name it differently. +lang=data/lang_chain + +for f in $train_data_dir/feats.scp $train_ivector_dir/ivector_online.scp \ + $lores_train_data_dir/feats.scp $gmm_dir/final.mdl \ + $ali_dir/ali.1.gz $gmm_dir/final.mdl; do [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1 done -if [ $stage -le 14 ]; then - echo "$0: creating lang directory with one state per phone." - # Create a version of the lang/ directory that has one state per phone in the - # topo file. [note, it really has two states.. the first one is only repeated - # once, the second one has zero or more repeats.] - if [ -d data/lang_chain ]; then - if [ data/lang_chain/L.fst -nt data/lang/L.fst ]; then - echo "$0: data/lang_chain already exists, not overwriting it; continuing" - else - echo "$0: data/lang_chain already exists and seems to be older than data/lang..." - echo " ... not sure what to do. Exiting." - exit 1; - fi - else - cp -r data/lang data/lang_chain - silphonelist=$(cat data/lang_chain/phones/silence.csl) || exit 1; - nonsilphonelist=$(cat data/lang_chain/phones/nonsilence.csl) || exit 1; - # Use our special topology... note that later on may have to tune this - # topology. 
- steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >data/lang_chain/topo - fi +# Please take this as a reference on how to specify all the options of +# local/chain/run_chain_common.sh +if $run_chain_common; then + local/chain/run_chain_common.sh --stage $stage \ + --gmm-dir $gmm_dir \ + --ali-dir $ali_dir \ + --lores-train-data-dir ${lores_train_data_dir} \ + --lang $lang \ + --lat-dir $lat_dir \ + --num-leaves 3500 \ + --tree-dir $tree_dir || exit 1; fi if [ $stage -le 15 ]; then - # Get the alignments as lattices (gives the chain training more freedom). - # use the same num-jobs as the alignments - steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ - data/lang $gmm_dir $lat_dir - rm $lat_dir/fsts.*.gz # save space -fi - -if [ $stage -le 16 ]; then - # Build a tree using our new topology. We know we have alignments for the - # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use - # those. - if [ -f $tree_dir/final.mdl ]; then - echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." - exit 1; - fi - steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ - --context-opts "--context-width=2 --central-position=1" \ - --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir -fi - -if [ $stage -le 17 ]; then mkdir -p $dir - echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + affine_opts="l2-regularize=0.01 dropout-proportion=0.0 dropout-per-dim=true dropout-per-dim-continuous=true" + tdnnf_opts="l2-regularize=0.01 dropout-proportion=0.0 bypass-scale=0.66" + linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0" + prefinal_opts="l2-regularize=0.01" + output_opts="l2-regularize=0.002" mkdir -p $dir/configs + cat < $dir/configs/network.xconfig input dim=100 name=ivector input dim=40 name=input - # please note that it is important to have input layer with the name=input # as the layer immediately preceding the fixed-affine-layer to enable # the use of short notation for the descriptor fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat - # the first splicing is moved before the lda layer, so no splicing here - relu-renorm-layer name=tdnn1 dim=450 - relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=450 - relu-renorm-layer name=tdnn3 input=Append(-1,0,1,2) dim=450 - relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=450 - relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=450 - relu-renorm-layer name=tdnn6 input=Append(-6,-3,0) dim=450 - - ## adding the layers for chain branch - relu-renorm-layer name=prefinal-chain input=tdnn6 dim=450 target-rms=0.5 - output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 - - # adding the layers for xent branch - # This block prints the configs for a separate output that will be - # trained with a cross-entropy objective in the 'chain' models... this - # has the effect of regularizing the hidden parts of the model. we use - # 0.5 / args.xent_regularize as the learning rate factor- the factor of - # 0.5 / args.xent_regularize is suitable as it means the xent - # final-layer learns at a rate independent of the regularization - # constant; and the 0.5 was tuned so as to make the relative progress - # similar in the xent and regular final layers. 
- relu-renorm-layer name=prefinal-xent input=tdnn6 dim=450 target-rms=0.5 - output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 - + relu-batchnorm-dropout-layer name=tdnn1 $affine_opts dim=1536 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=0 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf14 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf15 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + linear-component name=prefinal-l dim=256 $linear_opts + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts EOF steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ - fi -if [ $stage -le 18 ]; then + +if [ $stage -le 16 ]; then if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then utils/create_split_dir.pl \ - /export/b0{5,6,7,8}/$USER/kaldi-data/egs/gale_arabic-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/wsj-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage fi - steps/nnet3/chain/train.py --stage $train_stage \ + steps/nnet3/chain/train.py --stage $train_stage \ --cmd "$decode_cmd" \ --feat.online-ivector-dir $train_ivector_dir \ --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ --chain.xent-regularize $xent_regularize \ --chain.leaky-hmm-coefficient 0.1 \ - --chain.l2-regularize 0.00005 \ + --chain.l2-regularize 0.0 \ --chain.apply-deriv-weights false \ --chain.lm-opts="--num-extra-lm-states=2000" \ - --egs.dir "$common_egs_dir" \ - --egs.opts "--frames-overlap-per-eg 0" \ - --egs.chunk-width 150 \ - --trainer.num-chunk-per-minibatch 128 \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs 6 \ --trainer.frames-per-iter 1500000 \ - --trainer.num-epochs 4 \ - --trainer.optimization.num-jobs-initial 2 \ - --trainer.optimization.num-jobs-final 2 \ - --trainer.optimization.initial-effective-lrate 0.001 \ - --trainer.optimization.final-effective-lrate 0.0001 \ - --trainer.max-param-change 2.0 \ - --cleanup.remove-egs true \ - --feat-dir $train_data_dir \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.00025 \ + --trainer.optimization.final-effective-lrate 0.000025 \ + --trainer.num-chunk-per-minibatch=64,32 \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0 --constrained false" \ + --egs.stage $get_egs_stage \ + --reporting.email="$reporting_email" \ + --cleanup.remove-egs=$remove_egs \ + --feat-dir=$train_data_dir \ --tree-dir $tree_dir \ - --lat-dir $lat_dir \ - --dir $dir -fi - + --lat-dir=$lat_dir \ + --dir $dir || exit 1; +fi -if [ $stage -le 19 ]; then - # Note: it might appear that this data/lang_chain directory is mismatched, and it is as - # far as the 'topo' is concerned, but this script doesn't read the 'topo' from - # the lang directory. - utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_test $dir $dir/graph +if [ $stage -le 17 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. 
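# Note (editor's sketch, not part of the original patch): concretely, the
# graph could be built from a hypothetical lang directory with a different
# LM, say data/lang_test_fg, as long as its phones.txt matches:
#
#   utils/lang/check_phones_compatible.sh data/lang_test_fg/phones.txt $lang/phones.txt
#   utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test_fg $tree_dir $tree_dir/graph_fg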
+ + utils/lang/check_phones_compatible.sh \ + data/lang_test/phones.txt $lang/phones.txt + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang \ + $tree_dir $tree_dir/graph || exit 1; fi -if [ $stage -le 20 ]; then +if [ $stage -le 18 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) rm $dir/.error 2>/dev/null || true - steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ - --acwt 1.0 --post-decode-acwt 10.0 \ - --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_test_hires \ - --scoring-opts "--min-lmwt 5 " \ - $dir/graph data/test_hires $dir/decode || exit 1; + + steps/nnet3/decode.sh \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 0 --extra-right-context 0 \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$decode_cmd" --num-threads 4 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${test_set}_hires \ + $tree_dir/graph data/${test_set}_hires ${dir}/decode_${test_set} || exit 1 fi -exit 0 diff --git a/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1b.sh b/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1b.sh deleted file mode 100755 index 9e76130e7bd..00000000000 --- a/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1b.sh +++ /dev/null @@ -1,220 +0,0 @@ -#!/bin/bash - -# ./local/chain/compare_wer.sh exp/chain/tdnn_1b/ -# System tdnn_1b -# WER 17.23 -# CER 6.83 -# Final train prob -0.0825 -# Final valid prob -0.0987 -# Final train prob (xent) -0.6611 -# Final valid prob (xent) -0.7393 - -# head exp/chain/tdnn_1b/decode_test_rnnlm_1e_2_0.40/scoring_kaldi/best_wer -# WER 16.58 [ 11549 / 69668, 1290 ins, 2389 del, 7870 sub ] exp/chain/tdnn1c_swbd_sp/decode_test_rnnlm_1e_2_0.40/wer_10_0.5 - -# exp/chain/tdnn_1b/: num-iters=441 nj=3..16 num-params=16.5M dim=40+100->1792 combine=-0.081->-0.081 (over 6) xent:train/valid[293,440,final]=(-0.937,-0.659,-0.661/-0.960,-0.739,-0.739) logprob:train/valid[293,440,final]=(-0.124,-0.083,-0.083/-0.127,-0.100,-0.099) - -set -e -o pipefail -stage=0 -nj=30 -train_set=train -test_set=test -gmm=tri2b # this is the source gmm-dir that we'll use for alignments; it - # should have alignments for the specified training data. -num_threads_ubm=32 -nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. - -# Options which are not passed through to run_ivector_common.sh -affix=_1b #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. -common_egs_dir= -reporting_email= - -# LSTM/chain options -train_stage=-10 -xent_regularize=0.1 -dropout_schedule='0,0@0.20,0.5@0.50,0' - -# training chunk-options -chunk_width=150,110,100 -get_egs_stage=-10 - -# training options -srand=0 -remove_egs=true -run_ivector_common=true -run_chain_common=true -# End configuration section. -echo "$0 $@" # Print the command line for logging - - -. ./cmd.sh -. ./path.sh -. ./utils/parse_options.sh - - -if ! 
cuda-compiled; then - cat < $dir/configs/network.xconfig - input dim=100 name=ivector - input dim=40 name=input - # please note that it is important to have input layer with the name=input - # as the layer immediately preceding the fixed-affine-layer to enable - # the use of short notation for the descriptor - fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat - # the first splicing is moved before the lda layer, so no splicing here - relu-batchnorm-dropout-layer name=tdnn1 $affine_opts dim=1536 - tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 - tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 - tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 - tdnnf-layer name=tdnnf5 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=0 - tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 - tdnnf-layer name=tdnnf7 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 - tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 - tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 - tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 - tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 - tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 - tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 - tdnnf-layer name=tdnnf14 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 - tdnnf-layer name=tdnnf15 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 - linear-component name=prefinal-l dim=256 $linear_opts - prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 - output-layer name=output include-log-softmax=false dim=$num_targets $output_opts - prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 - output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts -EOF - steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ -fi - - -if [ $stage -le 16 ]; then - if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then - utils/create_split_dir.pl \ - /export/b0{3,4,5,6}/$USER/kaldi-data/egs/wsj-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage - fi - - steps/nnet3/chain/train.py --stage $train_stage \ - --cmd "$decode_cmd" \ - --feat.online-ivector-dir $train_ivector_dir \ - --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ - --chain.xent-regularize $xent_regularize \ - --chain.leaky-hmm-coefficient 0.1 \ - --chain.l2-regularize 0.0 \ - --chain.apply-deriv-weights false \ - --chain.lm-opts="--num-extra-lm-states=2000" \ - --trainer.dropout-schedule $dropout_schedule \ - --trainer.srand=$srand \ - --trainer.max-param-change=2.0 \ - --trainer.num-epochs 6 \ - --trainer.frames-per-iter 1500000 \ - --trainer.optimization.num-jobs-initial 3 \ - --trainer.optimization.num-jobs-final 16 \ - --trainer.optimization.initial-effective-lrate 0.00025 \ - --trainer.optimization.final-effective-lrate 0.000025 \ - --trainer.num-chunk-per-minibatch=64,32 \ - --trainer.add-option="--optimization.memory-compression-level=2" \ - --egs.chunk-width=$chunk_width \ - --egs.dir="$common_egs_dir" \ - --egs.opts "--frames-overlap-per-eg 0 --constrained false" \ - --egs.stage $get_egs_stage \ - --cleanup.remove-egs=$remove_egs \ - --feat-dir=$train_data_dir \ - --tree-dir $tree_dir \ - --lat-dir=$lat_dir \ - --dir $dir || exit 1; - -fi - -if [ $stage -le 17 ]; then - # The reason we are using data/lang here, instead of $lang, is just to - # emphasize that it's not actually important to give mkgraph.sh the - # lang directory with the matched topology (since it gets the - # topology file from the model). So you could give it a different - # lang directory, one that contained a wordlist and LM of your choice, - # as long as phones.txt was compatible. - - utils/lang/check_phones_compatible.sh \ - data/lang_test/phones.txt $lang/phones.txt - utils/mkgraph.sh \ - --self-loop-scale 1.0 data/lang \ - $tree_dir $tree_dir/graph || exit 1; -fi - -if [ $stage -le 18 ]; then - frames_per_chunk=$(echo $chunk_width | cut -d, -f1) - rm $dir/.error 2>/dev/null || true - - steps/nnet3/decode.sh \ - --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context 0 --extra-right-context 0 \ - --extra-left-context-initial 0 \ - --extra-right-context-final 0 \ - --frames-per-chunk $frames_per_chunk \ - --nj $nj --cmd "$decode_cmd" --num-threads 4 \ - --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${test_set}_hires \ - $tree_dir/graph data/${test_set}_hires ${dir}/decode_${test_set} || exit 1 -fi diff --git a/egs/gale_arabic/s5b/run.sh b/egs/gale_arabic/s5b/run.sh index bbb6349fea8..b2c10ec0a94 100755 --- a/egs/gale_arabic/s5b/run.sh +++ b/egs/gale_arabic/s5b/run.sh @@ -34,7 +34,8 @@ if [ $stage -le 0 ]; then echo "$0: Preparing data..." local/prepare_data.sh --dir1 $dir1 --dir2 $dir2 --dir3 $dir3 \ --text1 $text1 --text2 $text2 --text3 $text3 - + + echo "$0: Preparing lexicon and LM..." local/prepare_dict.sh utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang @@ -45,9 +46,6 @@ if [ $stage -le 0 ]; then data/local/dict/lexicon.txt data/lang fi -# Now make MFCC features. -# mfccdir should be some place with a largish disk where you -# want to store MFCC features. mfccdir=mfcc if [ $stage -le 1 ]; then echo "$0: Preparing the test and train feature files..." 
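# Note (editor's sketch, not part of the original patch): the loop body
# elided by this hunk is the usual Kaldi feature pipeline; stage 1
# presumably runs, for each of the train and test sets, something like:
#
#   for x in train test; do
#     steps/make_mfcc.sh --nj $num_jobs --cmd "$train_cmd" data/$x exp/make_mfcc/$x $mfccdir
#     steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir
#     utils/fix_data_dir.sh data/$x
#   done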
@@ -59,41 +57,41 @@ if [ $stage -le 1 ]; then done fi -# Here we start the AM if [ $stage -le 2 ]; then - # Let's create a subset with 10k segments to make quick flat-start training: + echo "$0: creating sub-set and training monophone system" utils/subset_data_dir.sh data/train 10000 data/train.10K || exit 1; - # Train monophone models on a subset of the data, 10K segment - # Note: the --boost-silence option should probably be omitted by default steps/train_mono.sh --nj 40 --cmd "$train_cmd" \ data/train.10K data/lang exp/mono || exit 1; fi if [ $stage -le 3 ]; then - # Get alignments from monophone system. + echo "$0: Aligning data using monophone system" steps/align_si.sh --nj $num_jobs --cmd "$train_cmd" \ data/train data/lang exp/mono exp/mono_ali || exit 1; - - # train tri1 [first triphone pass] + + echo "$0: training triphone system with delta features" steps/train_deltas.sh --cmd "$train_cmd" \ 2500 30000 data/train data/lang exp/mono_ali exp/tri1 || exit 1; fi if [ $stage -le 4 ] && $decode_gmm; then - # First triphone decoding utils/mkgraph.sh data/lang_test exp/tri1 exp/tri1/graph steps/decode.sh --nj $num_decode_jobs --cmd "$decode_cmd" \ exp/tri1/graph data/test exp/tri1/decode fi if [ $stage -le 5 ]; then + echo "$0: Aligning data and retraining and realigning with lda_mllt" steps/align_si.sh --nj $num_jobs --cmd "$train_cmd" \ data/train data/lang exp/tri1 exp/tri1_ali || exit 1; # train and decode tri2b [LDA+MLLT] steps/train_lda_mllt.sh --cmd "$train_cmd" 4000 50000 \ data/train data/lang exp/tri1_ali exp/tri2b || exit 1; + + steps/align_si.sh --nj $num_jobs --cmd "$train_cmd" \ + --use-graphs true data/train data/lang exp/tri2b exp/tri2b_ali || exit 1; fi if [ $stage -le 6 ] && $decode_gmm; then @@ -103,10 +101,9 @@ if [ $stage -le 6 ] && $decode_gmm; then fi if [ $stage -le 7 ]; then - steps/align_si.sh --nj $num_jobs --cmd "$train_cmd" \ - --use-graphs true data/train data/lang exp/tri2b exp/tri2b_ali || exit 1; - + echo "$0: Training a regular chain model using the e2e alignments..." local/chain/run_tdnn.sh #tdnn recipe: fi + echo training succedded exit 0 From 9c16689e98b9425e50004abf9045969eab10b00f Mon Sep 17 00:00:00 2001 From: aarora8 Date: Tue, 1 Jan 2019 22:19:46 -0500 Subject: [PATCH 06/13] modification from review --- egs/gale_arabic/s5b/local/prepare_dict.sh | 51 ++++++++++++-------- egs/gale_arabic/s5b/local/prepare_lexicon.py | 26 ++++++++++ egs/gale_arabic/s5b/local/wer_output_filter | 19 ++++++++ 3 files changed, 77 insertions(+), 19 deletions(-) create mode 100755 egs/gale_arabic/s5b/local/prepare_lexicon.py create mode 100755 egs/gale_arabic/s5b/local/wer_output_filter diff --git a/egs/gale_arabic/s5b/local/prepare_dict.sh b/egs/gale_arabic/s5b/local/prepare_dict.sh index abaf8177f77..5feef445357 100755 --- a/egs/gale_arabic/s5b/local/prepare_dict.sh +++ b/egs/gale_arabic/s5b/local/prepare_dict.sh @@ -1,33 +1,46 @@ -#!/bin/bash +#!/usr/bin/env bash # Copyright 2017 QCRI (author: Ahmed Ali) # Apache 2.0 +# This script prepares the dictionary. -mkdir -p data/local/dict -dir=$(utils/make_absolute.sh data/local/dict) +set -e +dir=data/local/dict +lexicon_url1="http://alt.qcri.org//resources/speech/dictionary/ar-ar_grapheme_lexicon_2016-02-09.bz2"; +lexicon_url2="http://alt.qcri.org//resources/speech/dictionary/ar-ar_lexicon_2014-03-17.txt.bz2"; +stage=0 +. ./cmd.sh +. ./path.sh +. 
./utils/parse_options.sh || exit 1;
+mkdir -p $dir data/local/lexicon_data

-wget http://alt.qcri.org//resources/speech/dictionary/ar-ar_grapheme_lexicon_2016-02-09.bz2 || exit 1;
-wget http://alt.qcri.org//resources/speech/dictionary/ar-ar_lexicon_2014-03-17.txt.bz2 || exit 1;
-bzcat ar-ar_grapheme_lexicon_2016-02-09.bz2 | sed '1,3d' | awk '{print $1}' > tmp$$
-bzcat ar-ar_lexicon_2014-03-17.txt.bz2 | sed '1,3d' | awk '{print $1}' >> tmp$$
+if [ $stage -le 0 ]; then
+  echo "$0: Downloading text for lexicon... $(date)."
+  wget -P data/local/lexicon_data $lexicon_url1
+  wget -P data/local/lexicon_data $lexicon_url2
+  bzcat data/local/lexicon_data/ar-ar_grapheme_lexicon_2016-02-09.bz2 | sed '1,3d' | awk '{print $1}' > data/local/lexicon_data/grapheme_lexicon
+  bzcat data/local/lexicon_data/ar-ar_lexicon_2014-03-17.txt.bz2 | sed '1,3d' | awk '{print $1}' >> data/local/lexicon_data/grapheme_lexicon
+  cat data/train/text | cut -d ' ' -f 2- | tr -s " " "\n" | sort -u >> data/local/lexicon_data/grapheme_lexicon
+fi

-cat data/train/text | cut -d ' ' -f 2- | tr -s " " "\n" | sort -u >> tmp$$
-grep -v [0-9] tmp$$ | sed -e 's:[FNKaui\~o\`]::g' -e 's:{:}:g' | sort -u > tmp1.$$ # remove vowels and rare alef wasla
-cat tmp1.$$ | sed 's:\(.\):\1 :g' | sed -e 's:  : :g' -e 's:  : :g' -e 's:\s*: :g' -e 's:\*:V:g' > tmp2.$$
-paste -d ' ' tmp1.$$ tmp2.$$ > $dir/lexicon.txt
-sed -i '1i <UNK> SIL' $dir/lexicon.txt
+if [ $stage -le 0 ]; then
+  echo "$0: processing lexicon text and creating lexicon... $(date)."
+  # remove vowels and rare alef wasla
+  grep -v [0-9] data/local/lexicon_data/grapheme_lexicon | sed -e 's:[FNKaui\~o\`]::g' -e 's:{:}:g' | sort -u > data/local/lexicon_data/processed_lexicon
+  local/prepare_lexicon.py
+fi

-echo SIL > $dir/silence_phones.txt
+cut -d' ' -f2- $dir/lexicon.txt | sed 's/SIL//g' | tr ' ' '\n' | sort -u | sed '/^$/d' >$dir/nonsilence_phones.txt || exit 1;

-echo SIL >$dir/optional_silence.txt
+sed -i '1i <UNK> UNK' $dir/lexicon.txt

-echo -n "" >$dir/extra_questions.txt
+echo '<sil> SIL' >> $dir/lexicon.txt

-cat tmp2.$$ | tr -s ' ' '\n' | grep -v ^$ | sort -u > $dir/nonsilence_phones.txt || exit 1;
+echo SIL > $dir/silence_phones.txt

-rm -fr ar-ar_lexicon_2014-03-17.txt.bz2 ar-ar_grapheme_lexicon_2016-02-09.bz2 tmp$$ tmp1.$$ tmp2.$$
+echo SIL >$dir/optional_silence.txt

-echo Dictionary preparation succeeded
+echo -n "" >$dir/extra_questions.txt

-exit 0
+echo "$0: Dictionary preparation succeeded"
diff --git a/egs/gale_arabic/s5b/local/prepare_lexicon.py b/egs/gale_arabic/s5b/local/prepare_lexicon.py
new file mode 100755
index 00000000000..215541585eb
--- /dev/null
+++ b/egs/gale_arabic/s5b/local/prepare_lexicon.py
@@ -0,0 +1,26 @@
+#!/usr/bin/env python3
+
+# Copyright 2018 Ashish Arora
+# Apache 2.0
+
+# This script prepares the lexicon.
+
+import argparse
+import os
+
+parser = argparse.ArgumentParser(description="""Creates the list of characters and words in lexicon""")
+args = parser.parse_args()
+
+### main ###
+lex = {}
+text_path = os.path.join('data','local', 'lexicon_data', 'processed_lexicon')
+with open(text_path, 'r', encoding='utf-8') as f:
+    for line in f:
+        line = line.strip()
+        characters = list(line)
+        characters = " ".join(['V' if char == '*' else char for char in characters])
+        lex[line] = characters
+
+with open(os.path.join('data','local','dict', 'lexicon.txt'), 'w', encoding='utf-8') as fp:
+    for key in sorted(lex):
+        fp.write(key + " " + lex[key] + "\n")
diff --git a/egs/gale_arabic/s5b/local/wer_output_filter b/egs/gale_arabic/s5b/local/wer_output_filter
new file mode 100755
index 00000000000..ee5c8809ca7
--- /dev/null
+++ b/egs/gale_arabic/s5b/local/wer_output_filter
@@ -0,0 +1,19 @@
+#!/usr/bin/env python3
+
+# Copyright 2017 Hossein Hadian
+
+# Apache 2.0
+# This script filters non-scored tokens (unknown-word symbols) out of the hypotheses. It is used in scoring
+
+import sys, io
+import string
+
+infile = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8')
+output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
+
+for line in infile:
+    words = line.strip().split()
+    words = [word for word in words if '<unk>' not in word]
+    uttid = words[0]
+    transcript = ' '.join(words[1:])
+    output.write(uttid + ' ' + transcript + '\n')
From 741c4be023ab3a9627593cd5a11b28bed3fc3fbb Mon Sep 17 00:00:00 2001
From: aarora8
Date: Wed, 2 Jan 2019 00:18:20 -0500
Subject: [PATCH 07/13] fixing bug

---
 egs/gale_arabic/s5b/local/prepare_lm.sh | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/egs/gale_arabic/s5b/local/prepare_lm.sh b/egs/gale_arabic/s5b/local/prepare_lm.sh
index 571ae1200df..0fe1b60c333 100755
--- a/egs/gale_arabic/s5b/local/prepare_lm.sh
+++ b/egs/gale_arabic/s5b/local/prepare_lm.sh
@@ -8,8 +8,9 @@ echo "=== Building a language model ..."

-locdata=data/local/lm/
-mkdir -p $locdata
+locdata dir=data/local/lm/
+text=data/local/train/text
+lexicon=data/local/dict/lexicon.txt

 # Language model order
 order=3
@@ -17,7 +18,11 @@ order=3
 . utils/parse_options.sh

 # Prepare a LM training corpus from the transcripts
-mkdir -p $locdata
+mkdir -p $dir
+
+for f in "$text" "$lexicon"; do
+  [ ! -f $f ] && echo "$0: No such file $f" && exit 1;
+done

 loc=`which ngram-count`;
 if [ -z $loc ]; then
@@ -37,10 +42,11 @@ if [ -z $loc ]; then
   fi
 fi

-cat data/train/text | cut -d " " -f 2- > $locdata/train.txt
+cat data/train/text | cut -d " " -f 2- > $dir/train.txt
+cut -d' ' -f1 $lexicon > $dir/wordlist

-ngram-count -text $locdata/train.txt -order $order -interpolate \
-  -kndiscount -lm $locdata/lm.gz
+ngram-count -text $dir/train.txt -order $order -limit-vocab -vocab $dir/wordlist \
+  -unk -map-unk "<unk>" -kndiscount -interpolate -lm $dir/lm.gz

-#ngram -lm $locdata/lm.gz -ppl $locdata/dev.txt
+#ngram -lm $dir/lm.gz -ppl $dir/dev.txt
 echo "*** Finished building the LM model!"
From 8b3ff02ac96ef1515724632fefa933db47f3d9c5 Mon Sep 17 00:00:00 2001
From: aarora8
Date: Wed, 2 Jan 2019 00:43:05 -0500
Subject: [PATCH 08/13] minor fix

---
 egs/gale_arabic/s5b/local/prepare_lm.sh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/egs/gale_arabic/s5b/local/prepare_lm.sh b/egs/gale_arabic/s5b/local/prepare_lm.sh
index 0fe1b60c333..e28c7932e23 100755
--- a/egs/gale_arabic/s5b/local/prepare_lm.sh
+++ b/egs/gale_arabic/s5b/local/prepare_lm.sh
@@ -8,8 +8,8 @@ echo "=== Building a language model ..."
-locdata dir=data/local/lm/
-text=data/local/train/text
+dir=data/local/lm/
+text=data/train/text
 lexicon=data/local/dict/lexicon.txt

 # Language model order
@@ -46,7 +46,7 @@ cat data/train/text | cut -d " " -f 2- > $dir/train.txt
 cut -d' ' -f1 $lexicon > $dir/wordlist

 ngram-count -text $dir/train.txt -order $order -limit-vocab -vocab $dir/wordlist \
-  -unk -map-unk "<unk>" -kndiscount -interpolate -lm $dir/lm.gz
+  -unk -map-unk "<UNK>" -kndiscount -interpolate -lm $dir/lm.gz

 #ngram -lm $dir/lm.gz -ppl $dir/dev.txt
 echo "*** Finished building the LM model!"
From ea38cda4ba6f979e14a3fe0ed6c8c448672addc8 Mon Sep 17 00:00:00 2001
From: aarora8
Date: Thu, 10 Jan 2019 16:49:00 -0500
Subject: [PATCH 09/13] adding train sat basis, unk phone in dict, minor changes

---
 egs/gale_arabic/s5b/cmd.sh                    |  6 ++--
 .../s5b/local/chain/tuning/run_tdnn_1a.sh     |  4 +--
 .../s5b/local/nnet3/run_ivector_common.sh     |  4 +--
 egs/gale_arabic/s5b/local/prepare_data.sh     | 15 +++++-----
 egs/gale_arabic/s5b/local/prepare_dict.sh     |  2 ++
 egs/gale_arabic/s5b/local/prepare_lm.sh       |  5 ++--
 egs/gale_arabic/s5b/local/wer_output_filter   |  2 +-
 egs/gale_arabic/s5b/run.sh                    | 30 ++++++++++++++-----
 8 files changed, 41 insertions(+), 27 deletions(-)

diff --git a/egs/gale_arabic/s5b/cmd.sh b/egs/gale_arabic/s5b/cmd.sh
index 71dd849a93b..ea341c98d4a 100755
--- a/egs/gale_arabic/s5b/cmd.sh
+++ b/egs/gale_arabic/s5b/cmd.sh
@@ -10,6 +10,6 @@
 # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
 # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.

-export train_cmd="queue.pl --mem 2G"
-export decode_cmd="queue.pl --mem 4G"
-export mkgraph_cmd="queue.pl --mem 8G"
+export train_cmd="retry.pl queue.pl --mem 2G"
+export decode_cmd="retry.pl queue.pl --mem 4G"
+export mkgraph_cmd="retry.pl queue.pl --mem 8G"
diff --git a/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1a.sh b/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1a.sh
index b5486decc31..7d19b88fcc7 100755
--- a/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1a.sh
+++ b/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1a.sh
@@ -19,7 +19,7 @@ stage=0
 nj=30
 train_set=train
 test_set=test
-gmm=tri2b  # this is the source gmm-dir that we'll use for alignments; it
+gmm=tri3b  # this is the source gmm-dir that we'll use for alignments; it
            # should have alignments for the specified training data.
 num_threads_ubm=32
 nnet3_affix=   # affix for exp dirs, e.g. it was _cleaned in tedlium.
@@ -201,7 +201,7 @@ if [ $stage -le 17 ]; then
   utils/lang/check_phones_compatible.sh \
     data/lang_test/phones.txt $lang/phones.txt
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/lang \
+    --self-loop-scale 1.0 data/lang_test \
    $tree_dir $tree_dir/graph || exit 1;
 fi
diff --git a/egs/gale_arabic/s5b/local/nnet3/run_ivector_common.sh b/egs/gale_arabic/s5b/local/nnet3/run_ivector_common.sh
index 5dc0818393b..f071842dc0b 100755
--- a/egs/gale_arabic/s5b/local/nnet3/run_ivector_common.sh
+++ b/egs/gale_arabic/s5b/local/nnet3/run_ivector_common.sh
@@ -12,7 +12,7 @@ stage=0
 nj=100
 train_set=train   # you might set this to e.g. train.
 test_sets="test"
-gmm=tri2b   # This specifies a GMM-dir from the features of the type you're training the system on;
+gmm=tri3b   # This specifies a GMM-dir from the features of the type you're training the system on;
             # it should contain alignments for 'train_set'.

 num_threads_ubm=32
@@ -146,7 +146,7 @@ fi

 if [ -f data/${train_set}_sp/feats.scp ] && [ $stage -le 7 ]; then
-  echo "$0: $feats already exists.  Refusing to overwrite the features "
+  echo "$0: data/${train_set}_sp/feats.scp already exists.  Refusing to overwrite the features "
   echo " to avoid wasting time.  Please remove the file and continue if you really mean this."
   exit 1;
 fi
diff --git a/egs/gale_arabic/s5b/local/prepare_data.sh b/egs/gale_arabic/s5b/local/prepare_data.sh
index 1561928bb48..aea9ba2dc8e 100755
--- a/egs/gale_arabic/s5b/local/prepare_data.sh
+++ b/egs/gale_arabic/s5b/local/prepare_data.sh
@@ -15,20 +15,20 @@ mkdir -p $gale_data
 # check that sox is installed
 which sox  &>/dev/null
 if [[ $? != 0 ]]; then
-  echo "sox is not installed"; exit 1
+  echo "$0: sox is not installed"; exit 1
 fi

 for dvd in $dir1 $dir2 $dir3; do
   dvd_full_path=$(utils/make_absolute.sh $dvd)
   if [[ ! -e $dvd_full_path ]]; then
-    echo missing $dvd_full_path; exit 1;
+    echo "$0: missing $dvd_full_path"; exit 1;
   fi
   find $dvd_full_path \( -name "*.wav" -o -name "*.flac" \) | while read file; do
    id=$(basename $file | awk '{gsub(".wav","");gsub(".flac","");print}')
    echo "$id sox $file -r 16000 -t wav - |"
   done
 done | sort -u > $gale_data/wav.scp
-echo data prep audio succeded
+echo "$0: data prep audio succeeded"

 gale_data=$(utils/make_absolute.sh "GALE" );
 top_pwd=`pwd`
@@ -36,13 +36,13 @@ txtdir=$gale_data/txt
 mkdir -p $txtdir; cd $txtdir

 for cdx in $text1 $text2 $text3; do
-  echo "Preparing $cdx"
+  echo "$0: Preparing $cdx"
   if [[ $cdx == *.tgz ]] ; then
     tar -xvf $cdx
   elif [ -d "$cdx" ]; then
     ln -s $cdx `basename $cdx`
   else
-    echo "I don't really know what I shall do with $cdx " >&2
+    echo "$0: I don't really know what I shall do with $cdx " >&2
   fi
 done

@@ -78,7 +78,7 @@ awk '{if ($1 == "conversational") {$1="";print $0}}' all_1.tmp$$ | sed 's:^ ::'
 cd ..; rm -fr $txtdir
 cd $top_pwd

-echo data prep text succeeded
+echo "$0: data prep text succeeded"

 mkdir -p data
 dir=$(utils/make_absolute.sh data/)
@@ -100,6 +100,5 @@ grep -f local/test_list $gale_data/wav.scp > $dir/test/wav.scp
 cat $gale_data/wav.scp | awk -v seg=$dir/train/segments 'BEGIN{while((getline<seg)>0) {seen[$2]=1;}} {if (seen[$1]) { print $0}}' > $dir/train/wav.scp

-echo data prep split succeeded
-
+echo "$0: data prep split succeeded"
 exit 0
diff --git a/egs/gale_arabic/s5b/local/prepare_dict.sh b/egs/gale_arabic/s5b/local/prepare_dict.sh
index 5feef445357..47b5869fdf1 100755
--- a/egs/gale_arabic/s5b/local/prepare_dict.sh
+++ b/egs/gale_arabic/s5b/local/prepare_dict.sh
@@ -35,6 +35,8 @@ cut -d' ' -f2- $dir/lexicon.txt | sed 's/SIL//g' | tr ' ' '\n' | sort -u | sed '
 sed -i '1i <UNK> UNK' $dir/lexicon.txt

+echo UNK >> $dir/nonsilence_phones.txt
+
 echo '<sil> SIL' >> $dir/lexicon.txt

 echo SIL > $dir/silence_phones.txt
diff --git a/egs/gale_arabic/s5b/local/prepare_lm.sh b/egs/gale_arabic/s5b/local/prepare_lm.sh
index e28c7932e23..6fdf35f471a 100755
--- a/egs/gale_arabic/s5b/local/prepare_lm.sh
+++ b/egs/gale_arabic/s5b/local/prepare_lm.sh
@@ -11,7 +11,6 @@ echo "=== Building a language model ..."
 dir=data/local/lm/
 text=data/train/text
 lexicon=data/local/dict/lexicon.txt
-
 # Language model order
 order=3
@@ -42,10 +41,10 @@ if [ -z $loc ]; then
   fi
 fi

-cat data/train/text | cut -d " " -f 2- > $dir/train.txt
+cat data/train/text | cut -d " " -f 2- > $dir/text.txt
 cut -d' ' -f1 $lexicon > $dir/wordlist

-ngram-count -text $dir/train.txt -order $order -limit-vocab -vocab $dir/wordlist \
+ngram-count -text $dir/text.txt -order $order -limit-vocab -vocab $dir/wordlist \
   -unk -map-unk "<UNK>" -kndiscount -interpolate -lm $dir/lm.gz

 #ngram -lm $dir/lm.gz -ppl $dir/dev.txt
 echo "*** Finished building the LM model!"
diff --git a/egs/gale_arabic/s5b/local/wer_output_filter b/egs/gale_arabic/s5b/local/wer_output_filter
index ee5c8809ca7..cf48b434144 100755
--- a/egs/gale_arabic/s5b/local/wer_output_filter
+++ b/egs/gale_arabic/s5b/local/wer_output_filter
@@ -13,7 +13,7 @@ output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')

 for line in infile:
     words = line.strip().split()
-    words = [word for word in words if '<unk>' not in word]
+    words = [word for word in words if '<UNK>' not in word]
     uttid = words[0]
     transcript = ' '.join(words[1:])
     output.write(uttid + ' ' + transcript + '\n')
diff --git a/egs/gale_arabic/s5b/run.sh b/egs/gale_arabic/s5b/run.sh
index b2c10ec0a94..3f12d22495e 100755
--- a/egs/gale_arabic/s5b/run.sh
+++ b/egs/gale_arabic/s5b/run.sh
@@ -5,7 +5,7 @@

 num_jobs=120
 num_decode_jobs=40
-decode_gmm=false
+decode_gmm=true
 stage=0
 overwrite=false

@@ -43,7 +43,7 @@ if [ $stage -le 0 ]; then

   local/prepare_lm.sh

   utils/format_lm.sh data/lang data/local/lm/lm.gz \
-                     data/local/dict/lexicon.txt data/lang
+                     data/local/dict/lexicon.txt data/lang_test
 fi

 mfccdir=mfcc
@@ -86,12 +86,8 @@ if [ $stage -le 5 ]; then
   steps/align_si.sh --nj $num_jobs --cmd "$train_cmd" \
     data/train data/lang exp/tri1 exp/tri1_ali || exit 1;

-  # train and decode tri2b [LDA+MLLT]
   steps/train_lda_mllt.sh --cmd "$train_cmd" 4000 50000 \
     data/train data/lang exp/tri1_ali exp/tri2b || exit 1;
-
-  steps/align_si.sh --nj $num_jobs --cmd "$train_cmd" \
-    --use-graphs true data/train data/lang exp/tri2b exp/tri2b_ali || exit 1;
 fi

 if [ $stage -le 6 ] && $decode_gmm; then
@@ -101,10 +97,27 @@ if [ $stage -le 6 ] && $decode_gmm; then
 fi

 if [ $stage -le 7 ]; then
+  echo "$0: Aligning data, then retraining and realigning with SAT (basis fMLLR)"
+  steps/align_si.sh --nj $num_jobs --cmd "$train_cmd" \
+    data/train data/lang exp/tri2b exp/tri2b_ali || exit 1;
+
+  steps/train_sat_basis.sh --cmd "$train_cmd" \
+    5000 100000 data/train data/lang exp/tri2b_ali exp/tri3b || exit 1;
+
+  steps/align_fmllr.sh --nj $num_jobs --cmd "$train_cmd" \
+    data/train data/lang exp/tri3b exp/tri3b_ali || exit 1;
+fi
+
+if [ $stage -le 8 ] && $decode_gmm; then
+  utils/mkgraph.sh data/lang_test exp/tri3b exp/tri3b/graph
+  steps/decode_fmllr.sh --nj $num_decode_jobs --cmd \
+    "$decode_cmd" exp/tri3b/graph data/test exp/tri3b/decode
+fi
+
+if [ $stage -le 9 ]; then
   echo "$0: Training a regular chain model using the e2e alignments..."
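# Note (editor's sketch, not part of the original patch): by Kaldi convention
# local/chain/run_tdnn.sh below is a symlink into the tuning directory;
# assuming the 1a setup from this series, it would be created as:
#
#   ln -s tuning/run_tdnn_1a.sh local/chain/run_tdnn.sh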
-  local/chain/run_tdnn.sh #tdnn recipe:
+  local/chain/run_tdnn.sh
 fi
-echo training succedded
+echo "$0: training succeeded"
 exit 0
From 8bc9325c31510ddeb89ea48840b2c0fc583ce304 Mon Sep 17 00:00:00 2001
From: aarora8
Date: Thu, 10 Jan 2019 18:28:21 -0500
Subject: [PATCH 10/13] updating results

---
 egs/gale_arabic/s5b/RESULTS                   |  5 ++++
 .../s5b/local/chain/tuning/run_tdnn_1a.sh     | 24 +++++++++----------
 2 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/egs/gale_arabic/s5b/RESULTS b/egs/gale_arabic/s5b/RESULTS
index 2260a106654..dec0287b33c 100644
--- a/egs/gale_arabic/s5b/RESULTS
+++ b/egs/gale_arabic/s5b/RESULTS
@@ -70,3 +70,8 @@ Combined Results for Reports and Conversational WER:
 %WER 36.17 [ 25196 / 69668, 2429 ins, 5393 del, 17374 sub ] exp/tri2b/decode/wer_16
 %WER 39.42 [ 27462 / 69668, 2473 ins, 6051 del, 18938 sub ] exp/tri2a/decode/wer_14
 %WER 40.35 [ 28113 / 69668, 2713 ins, 5635 del, 19765 sub ] exp/tri1/decode/wer_13
+
+
+WER 33.35 [ 23233 / 69668, 2385 ins, 5274 del, 15574 sub ] exp/tri3b/decode/wer_16_0.5 (train_sat_basis)
+current best 'chain' models (see local/chain/tuning/run_tdnn_1a.sh)
+%WER 16.66 [ 11610 / 69668, 1233 ins, 2747 del, 7630 sub ] exp/chain/tdnn_1a_sp/decode_test/wer_10_0.0
diff --git a/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1a.sh b/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1a.sh
index 7d19b88fcc7..dbdd7157cea 100755
--- a/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1a.sh
+++ b/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1a.sh
@@ -1,18 +1,16 @@
 #!/bin/bash

-# ./local/chain/compare_wer.sh exp/chain/tdnn_1b/
-# System tdnn_1b
-# WER 17.23
-# CER 6.83
-# Final train prob -0.0825
-# Final valid prob -0.0987
-# Final train prob (xent) -0.6611
-# Final valid prob (xent) -0.7393
-
-# head exp/chain/tdnn_1b/decode_test_rnnlm_1e_2_0.40/scoring_kaldi/best_wer
-# WER 16.58 [ 11549 / 69668, 1290 ins, 2389 del, 7870 sub ] exp/chain/tdnn1c_swbd_sp/decode_test_rnnlm_1e_2_0.40/wer_10_0.5
-
-# exp/chain/tdnn_1b/: num-iters=441 nj=3..16 num-params=16.5M dim=40+100->1792 combine=-0.081->-0.081 (over 6) xent:train/valid[293,440,final]=(-0.937,-0.659,-0.661/-0.960,-0.739,-0.739) logprob:train/valid[293,440,final]=(-0.124,-0.083,-0.083/-0.127,-0.100,-0.099)
+# ./local/chain/compare_wer.sh exp/chain/tdnn_1a_sp
+# System tdnn_1a_sp
+# WER 16.66
+# CER 6.70
+# Final train prob -0.0674
+# Final valid prob -0.0832
+# Final train prob (xent) -0.8575
+# Final valid prob (xent) -0.9472
+
+# steps/info/chain_dir_info.pl exp/chain/tdnn_1a_sp/
+# exp/chain/tdnn_1a_sp/: num-iters=441 nj=3..16 num-params=17.2M dim=40+100->3024 combine=-0.064->-0.064 (over 5) xent:train/valid[293,440,final]=(-1.17,-0.868,-0.858/-1.24,-0.956,-0.947) logprob:train/valid[293,440,final]=(-0.102,-0.068,-0.067/-0.113,-0.084,-0.083)
From fb90d785033e98785284fd1b092cbee7012eeb2a Mon Sep 17 00:00:00 2001
From: aarora8
Date: Mon, 21 Jan 2019 13:33:42 -0500
Subject: [PATCH 11/13] updating results

---
 egs/gale_arabic/s5b/RESULTS                   | 20 +++++++++++++++++--
 .../s5b/local/chain/tuning/run_tdnn_1a.sh     | 17 ++++++++--------
 2 files changed, 27 insertions(+), 10 deletions(-)

diff --git a/egs/gale_arabic/s5b/RESULTS b/egs/gale_arabic/s5b/RESULTS
index dec0287b33c..9ae9ba81b7a 100644
--- a/egs/gale_arabic/s5b/RESULTS
+++ b/egs/gale_arabic/s5b/RESULTS
@@ -72,6 +72,22 @@ Combined Results for Reports and Conversational WER:
 %WER 40.35 [ 28113 / 69668, 2713 ins, 5635 del, 19765 sub ] exp/tri1/decode/wer_13


-WER 33.35 [ 23233 / 69668, 2385 ins, 5274 del, 15574 sub ] exp/tri3b/decode/wer_16_0.5 (train_sat_basis)
-current best 'chain' models (see local/chain/tuning/run_tdnn_1a.sh)
+# Effect of GMM seed model (tri2b instead of tri3b). Using tri3b gives a slightly better result
+# as compared to using tri2b as seed.
 %WER 16.66 [ 11610 / 69668, 1233 ins, 2747 del, 7630 sub ] exp/chain/tdnn_1a_sp/decode_test/wer_10_0.0
+%WER 16.71 [ 11642 / 69668, 1145 ins, 2908 del, 7589 sub ] exp/chain/tdnn_1a_sp/decode_test/wer_9_0.0
+
+# WER with train_sat_basis
+%WER 33.35 [ 23233 / 69668, 2385 ins, 5274 del, 15574 sub ] exp/tri3b/decode/wer_16_0.5
+
+# Effect of Tree-size (3500, 4500, 7000)
+%WER 16.66 [ 11610 / 69668, 1233 ins, 2747 del, 7630 sub ] exp/chain/tdnn_1a_3500_sp/decode_test/wer_10_0.0
+%WER 16.59 [ 11557 / 69668, 1234 ins, 2646 del, 7677 sub ] exp/chain/tdnn_1a_4500_sp/decode_test/wer_10_0.0
+%WER 16.47 [ 11474 / 69668, 1421 ins, 2207 del, 7846 sub ] exp/chain/tdnn_1a_7000_sp/decode_test/wer_9_0.0
+
+# Effect of l2-regularization on the output with tree-size=7000. l2 on the output (0.005,0.002)
+%WER 16.54 [ 11522 / 69668, 1123 ins, 2739 del, 7660 sub ] exp/chain/tdnn_1a_7000_05_sp/decode_test/wer_9_0.5
+%WER 16.47 [ 11474 / 69668, 1421 ins, 2207 del, 7846 sub ] exp/chain/tdnn_1a_7000_sp/decode_test/wer_9_0.0
+
+#current best 'chain' models (see local/chain/tuning/run_tdnn_1a.sh)
+%WER 16.47 [ 11474 / 69668, 1421 ins, 2207 del, 7846 sub ] exp/chain/tdnn_1a_sp/decode_test/wer_9_0.0
diff --git a/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1a.sh b/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1a.sh
index dbdd7157cea..a3ccfda04ac 100755
--- a/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1a.sh
+++ b/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1a.sh
@@ -2,15 +2,16 @@

 # ./local/chain/compare_wer.sh exp/chain/tdnn_1a_sp
 # System tdnn_1a_sp
-# WER 16.66
-# CER 6.70
-# Final train prob -0.0674
-# Final valid prob -0.0832
-# Final train prob (xent) -0.8575
-# Final valid prob (xent) -0.9472
+# WER 16.47
+# CER 6.68
+# Final train prob -0.0652
+# Final valid prob -0.0831
+# Final train prob (xent) -0.8965
+# Final valid prob (xent) -0.9964

 # steps/info/chain_dir_info.pl exp/chain/tdnn_1a_sp/
-# exp/chain/tdnn_1a_sp/: num-iters=441 nj=3..16 num-params=17.2M dim=40+100->3024 combine=-0.064->-0.064 (over 5) xent:train/valid[293,440,final]=(-1.17,-0.868,-0.858/-1.24,-0.956,-0.947) logprob:train/valid[293,440,final]=(-0.102,-0.068,-0.067/-0.113,-0.084,-0.083)
+# exp/chain/tdnn_1a_sp/: num-iters=441 nj=3..16 num-params=18.6M dim=40+100->5816 combine=-0.063->-0.062 (over 6) xent:train/valid[293,440,final]=(-1.22,-0.912,-0.896/-1.29,-1.01,-0.996) logprob:train/valid[293,440,final]=(-0.097,-0.066,-0.065/-0.108,-0.084,-0.083)
+
 set -e -o pipefail
 stage=0
@@ -99,7 +99,7 @@ if $run_chain_common; then
     --lores-train-data-dir ${lores_train_data_dir} \
     --lang $lang \
     --lat-dir $lat_dir \
-    --num-leaves 3500 \
+    --num-leaves 7000 \
     --tree-dir $tree_dir || exit 1;
 fi
From 1abadddcd3d0fe774d31c191a657b6dce3a02f04 Mon Sep 17 00:00:00 2001
From: aarora8
Date: Mon, 21 Jan 2019 14:17:54 -0500
Subject: [PATCH 12/13] minor fix

---
 egs/gale_arabic/s5b/RESULTS | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/egs/gale_arabic/s5b/RESULTS b/egs/gale_arabic/s5b/RESULTS
index 9ae9ba81b7a..b0ff31615c7 100644
--- a/egs/gale_arabic/s5b/RESULTS
+++ b/egs/gale_arabic/s5b/RESULTS
@@ -80,10 +80,11 @@ Combined Results for Reports and Conversational WER:

 # WER with train_sat_basis
 %WER 33.35 [ 23233 / 69668, 2385 ins, 5274 del, 15574 sub ] exp/tri3b/decode/wer_16_0.5

-# Effect of Tree-size (3500, 4500, 7000)
+# Effect of Tree-size (3500, 4500, 7000, 11000)
 %WER 16.66 [ 11610 / 69668, 1233 ins, 2747 del, 7630 sub ] exp/chain/tdnn_1a_3500_sp/decode_test/wer_10_0.0
 %WER 16.59 [ 11557 / 69668, 1234 ins, 2646 del, 7677 sub ] exp/chain/tdnn_1a_4500_sp/decode_test/wer_10_0.0
 %WER 16.47 [ 11474 / 69668, 1421 ins, 2207 del, 7846 sub ] exp/chain/tdnn_1a_7000_sp/decode_test/wer_9_0.0
+%WER 16.62 [ 11580 / 69668, 1164 ins, 2789 del, 7627 sub ] exp/chain/tdnn_1a_11000_sp/decode_test/wer_10_0.0

 # Effect of l2-regularization on the output with tree-size=7000. l2 on the output (0.005,0.002)
 %WER 16.54 [ 11522 / 69668, 1123 ins, 2739 del, 7660 sub ] exp/chain/tdnn_1a_7000_05_sp/decode_test/wer_9_0.5
 %WER 16.47 [ 11474 / 69668, 1421 ins, 2207 del, 7846 sub ] exp/chain/tdnn_1a_7000_sp/decode_test/wer_9_0.0
From 42aa3915922ad55bfaa109c580a771c86a7e8f83 Mon Sep 17 00:00:00 2001
From: aarora8
Date: Tue, 22 Jan 2019 15:41:06 -0500
Subject: [PATCH 13/13] modification from the review

---
 egs/gale_arabic/s5b/RESULTS | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/egs/gale_arabic/s5b/RESULTS b/egs/gale_arabic/s5b/RESULTS
index b0ff31615c7..e0fb9d38ceb 100644
--- a/egs/gale_arabic/s5b/RESULTS
+++ b/egs/gale_arabic/s5b/RESULTS
@@ -65,6 +65,9 @@ Combined Results for Reports and Conversational WER:
 %WER 32.36 [ 22542 / 69668, 2156 ins, 4184 del, 16202 sub ] exp/tri2b_mmi/decode_it4/wer_11
 %WER 32.50 [ 22640 / 69668, 2393 ins, 3956 del, 16291 sub ] exp/tri2b_mmi/decode_it3/wer_11
 %WER 32.79 [ 22847 / 69668, 2407 ins, 4760 del, 15680 sub ] exp/tri2b_mpe/decode_it3/wer_13
+# WER with train_sat_basis
+%WER 33.35 [ 23233 / 69668, 2385 ins, 5274 del, 15574 sub ] exp/tri3b/decode/wer_16_0.5
+# WER with train_sat
 %WER 33.61 [ 23413 / 69668, 2817 ins, 4577 del, 16019 sub ] exp/tri3b/decode/wer_17
 %WER 35.73 [ 24894 / 69668, 2630 ins, 4944 del, 17320 sub ] exp/tri3b/decode.si/wer_15
 %WER 36.17 [ 25196 / 69668, 2429 ins, 5393 del, 17374 sub ] exp/tri2b/decode/wer_16
@@ -74,11 +77,8 @@ Combined Results for Reports and Conversational WER:

 # Effect of GMM seed model (tri2b instead of tri3b). Using tri3b gives a slightly better result
 # as compared to using tri2b as seed.
-%WER 16.66 [ 11610 / 69668, 1233 ins, 2747 del, 7630 sub ] exp/chain/tdnn_1a_sp/decode_test/wer_10_0.0
-%WER 16.71 [ 11642 / 69668, 1145 ins, 2908 del, 7589 sub ] exp/chain/tdnn_1a_sp/decode_test/wer_9_0.0
-
-# WER with train_sat_basis
-%WER 33.35 [ 23233 / 69668, 2385 ins, 5274 del, 15574 sub ] exp/tri3b/decode/wer_16_0.5
+%WER 16.66 [ 11610 / 69668, 1233 ins, 2747 del, 7630 sub ] exp/chain/tdnn_1a_3b_sp/decode_test/wer_10_0.0
+%WER 16.71 [ 11642 / 69668, 1145 ins, 2908 del, 7589 sub ] exp/chain/tdnn_1a_2b_sp/decode_test/wer_9_0.0

 # Effect of Tree-size (3500, 4500, 7000, 11000)
 %WER 16.66 [ 11610 / 69668, 1233 ins, 2747 del, 7630 sub ] exp/chain/tdnn_1a_3500_sp/decode_test/wer_10_0.0
 %WER 16.59 [ 11557 / 69668, 1234 ins, 2646 del, 7677 sub ] exp/chain/tdnn_1a_4500_sp/decode_test/wer_10_0.0
 %WER 16.47 [ 11474 / 69668, 1421 ins, 2207 del, 7846 sub ] exp/chain/tdnn_1a_7000_sp/decode_test/wer_9_0.0
 %WER 16.62 [ 11580 / 69668, 1164 ins, 2789 del, 7627 sub ] exp/chain/tdnn_1a_11000_sp/decode_test/wer_10_0.0

 # Effect of l2-regularization on the output with tree-size=7000. l2 on the output (0.005,0.002)
-%WER 16.54 [ 11522 / 69668, 1123 ins, 2739 del, 7660 sub ] exp/chain/tdnn_1a_7000_05_sp/decode_test/wer_9_0.5
-%WER 16.47 [ 11474 / 69668, 1421 ins, 2207 del, 7846 sub ] exp/chain/tdnn_1a_7000_sp/decode_test/wer_9_0.0
+%WER 16.54 [ 11522 / 69668, 1123 ins, 2739 del, 7660 sub ] exp/chain/tdnn_1a_7000_005_sp/decode_test/wer_9_0.5
+%WER 16.47 [ 11474 / 69668, 1421 ins, 2207 del, 7846 sub ] exp/chain/tdnn_1a_7000_002_sp/decode_test/wer_9_0.0

 #current best 'chain' models (see local/chain/tuning/run_tdnn_1a.sh)
 %WER 16.47 [ 11474 / 69668, 1421 ins, 2207 del, 7846 sub ] exp/chain/tdnn_1a_sp/decode_test/wer_9_0.0
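# Note (editor's sketch, not part of the original patch): the comparisons in
# this RESULTS file can be regenerated with the comparison script added in
# this series, e.g. for the tree-size sweep:
#
#   local/chain/compare_wer.sh exp/chain/tdnn_1a_3500_sp exp/chain/tdnn_1a_4500_sp \
#       exp/chain/tdnn_1a_7000_sp exp/chain/tdnn_1a_11000_sp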