diff --git a/egs/zeroth_korean/s5/README.txt b/egs/zeroth_korean/s5/README.txt new file mode 100644 index 00000000000..daa007362d8 --- /dev/null +++ b/egs/zeroth_korean/s5/README.txt @@ -0,0 +1,13 @@ +Zeroth-Korean kaldi example is from Zeroth Project. The Zeroth project introduces a free Korean speech corpus and aims to make Korean speech recognition more broadly accessible to everyone. This project was developed in collaboration between Lucas Jo (@Atlas Guide Inc.) and Wonkyum Lee (@Gridspace Inc.). + +In this example, we are using 51.6 hours of transcribed Korean audio for training data (22,263 utterances, 105 people, 3000 sentences) and 1.2 hours of transcribed Korean audio for testing data (457 utterances, 10 people). Besides audio and transcription, we provide a pre-trained/designed language model, lexicon and morpheme-based segmenter (morfessor). + +The database can also be downloaded from openslr: +http://www.openslr.org/40 + +The database is licensed under Attribution 4.0 International (CC BY 4.0) + +This folder contains a speech recognition recipe which is based on the WSJ/Librispeech examples. + +For more details about the Zeroth project, please visit: +https://github.com/goodatlas/zeroth diff --git a/egs/zeroth_korean/s5/RESULTS b/egs/zeroth_korean/s5/RESULTS new file mode 100644 index 00000000000..9255ec17673 --- /dev/null +++ b/egs/zeroth_korean/s5/RESULTS @@ -0,0 +1,63 @@ +#!/bin/bash + +# this RESULTS file was obtained by Wonkyum Lee in July 2018. 
+ +for dir in exp/*; do + steps/info/gmm_dir_info.pl $dir + for x in $dir/decode*test*; do [ -d $x ] && [[ $x =~ "$1" ]] && grep WER $x/wer_* | utils/best_wer.sh; done +done +exit 0 + +# monophone, trained on the 2k shortest utterances +exp/mono: nj=16 align prob=-99.85 over 2.66h [retry=0.8%, fail=0.3%] states=130 gauss=1004 +%WER 70.24 [ 6499 / 9253, 295 ins, 1399 del, 4805 sub ] exp/mono/decode_nosp_fglarge_test_clean/wer_8_0.5 +%WER 71.28 [ 6596 / 9253, 185 ins, 1721 del, 4690 sub ] exp/mono/decode_nosp_tglarge_test_clean/wer_9_1.0 +%WER 78.83 [ 7294 / 9253, 218 ins, 1752 del, 5324 sub ] exp/mono/decode_nosp_tgsmall_test_clean/wer_10_0.0 + +# first triphone build, trained on 5k utterances +exp/tri1: nj=16 align prob=-98.34 over 11.55h [retry=1.6%, fail=0.6%] states=1568 gauss=10030 tree-impr=4.07 +%WER 37.44 [ 3464 / 9253, 258 ins, 725 del, 2481 sub ] exp/tri1/decode_nosp_fglarge_test_clean/wer_15_0.5 +%WER 38.85 [ 3595 / 9253, 347 ins, 633 del, 2615 sub ] exp/tri1/decode_nosp_tglarge_test_clean/wer_15_0.0 +%WER 53.23 [ 4925 / 9253, 296 ins, 1060 del, 3569 sub ] exp/tri1/decode_nosp_tgsmall_test_clean/wer_15_0.0 + +# tri2 is an LDA+MLLT system, trained on 10k utterances +exp/tri2: nj=16 align prob=-49.63 over 23.00h [retry=1.7%, fail=0.8%] states=2000 gauss=15039 tree-impr=4.70 lda-sum=18.11 mllt:impr,logdet=0.99,1.39 +%WER 33.50 [ 3100 / 9253, 248 ins, 626 del, 2226 sub ] exp/tri2/decode_nosp_fglarge_test_clean/wer_16_0.5 +%WER 34.55 [ 3197 / 9253, 315 ins, 537 del, 2345 sub ] exp/tri2/decode_nosp_tglarge_test_clean/wer_16_0.0 +%WER 48.98 [ 4532 / 9253, 303 ins, 903 del, 3326 sub ] exp/tri2/decode_nosp_tgsmall_test_clean/wer_14_0.0 + +# tri3 is an LDA+MLLT+SAT system, trained on entire clean training set +exp/tri3: nj=16 align prob=-48.95 over 51.22h [retry=1.6%, fail=0.7%] states=3336 gauss=40065 fmllr-impr=2.72 over 19.18h tree-impr=7.23 +%WER 23.89 [ 2211 / 9253, 233 ins, 404 del, 1574 sub ] exp/tri3/decode_nosp_fglarge_test_clean/wer_15_0.0 +%WER 24.47 [ 
2264 / 9253, 252 ins, 385 del, 1627 sub ] exp/tri3/decode_nosp_tglarge_test_clean/wer_13_0.0 +%WER 37.81 [ 3499 / 9253, 274 ins, 671 del, 2554 sub ] exp/tri3/decode_nosp_tgsmall_test_clean/wer_13_0.0 +%WER 49.00 [ 4534 / 9253, 302 ins, 874 del, 3358 sub ] exp/tri3/decode_nosp_tgsmall_test_clean.si/wer_14_0.0 +%WER 21.68 [ 2006 / 9253, 226 ins, 346 del, 1434 sub ] exp/tri3/decode_fglarge_test_clean/wer_15_0.0 +%WER 22.59 [ 2090 / 9253, 231 ins, 372 del, 1487 sub ] exp/tri3/decode_tglarge_test_clean/wer_15_0.0 +%WER 34.83 [ 3223 / 9253, 294 ins, 605 del, 2324 sub ] exp/tri3/decode_tgsmall_test_clean/wer_12_0.0 +%WER 45.28 [ 4190 / 9253, 270 ins, 880 del, 3040 sub ] exp/tri3/decode_tgsmall_test_clean.si/wer_15_0.0 + +# tri4 is an LDA+MLLT+SAT system after estimating pronunciation probabilities +# and word-and-pronunciation-dependent silence probabilities. +exp/tri4: nj=16 align prob=-48.70 over 51.22h [retry=1.5%, fail=0.7%] states=3368 gauss=40039 fmllr-impr=0.23 over 42.91h tree-impr=7.87 +%WER 21.61 [ 2000 / 9253, 210 ins, 379 del, 1411 sub ] exp/tri4/decode_fglarge_test_clean/wer_14_0.5 +%WER 22.59 [ 2090 / 9253, 237 ins, 371 del, 1482 sub ] exp/tri4/decode_tglarge_test_clean/wer_15_0.0 +%WER 34.57 [ 3199 / 9253, 285 ins, 595 del, 2319 sub ] exp/tri4/decode_tgsmall_test_clean/wer_12_0.0 +%WER 45.82 [ 4240 / 9253, 270 ins, 833 del, 3137 sub ] exp/tri4/decode_tgsmall_test_clean.si/wer_13_0.0 + +for dir in exp/chain/tdnn*_sp; do + steps/info/chain_dir_info.pl $dir + for x in ${dir}_online/decode*test*; do [ -d $x ] && [[ $x =~ "$1" ]] && grep WER $x/wer_* | utils/best_wer.sh; done +done +exit 0 + +# tdnn_1a is a kind of factorized TDNN, with skip connections. 
+exp/chain/tdnn1b_sp: num-iters=174 nj=2..8 num-params=12.9M dim=40+100->3040 combine=-0.041->-0.041 (over 2) xent:train/valid[115,173,final]=(-1.14,-0.759,-0.751/-1.14,-0.788,-0.777) logprob:train/valid[115,173,final]=(-0.084,-0.047,-0.046/-0.080,-0.050,-0.048) +%WER 10.55 [ 976 / 9253, 122 ins, 166 del, 688 sub ] exp/chain/tdnn1b_sp_online/decode_fglarge_test_clean/wer_13_1.0 +%WER 17.65 [ 1633 / 9253, 208 ins, 233 del, 1192 sub ] exp/chain/tdnn1b_sp_online/decode_tgsmall_test_clean/wer_10_0.0 + +# This chain system has TDNN+Norm-OPGRU architecture. +exp/chain/tdnn_opgru1a_sp: num-iters=99 nj=2..12 num-params=38.0M dim=40+100->3040 combine=-0.045->-0.045 (over 1) xent:train/valid[65,98,final]=(-1.18,-0.663,-0.651/-1.21,-0.698,-0.684) logprob:train/valid[65,98,final]=(-0.079,-0.038,-0.037/-0.076,-0.040,-0.039) +%WER 9.45 [ 874 / 9253, 109 ins, 159 del, 606 sub ] exp/chain/tdnn_opgru1a_sp_online/decode_fglarge_test_clean/wer_10_1.0 +%WER 15.22 [ 1408 / 9253, 175 ins, 196 del, 1037 sub ] exp/chain/tdnn_opgru1a_sp_online/decode_tgsmall_test_clean/wer_8_0.0 + diff --git a/egs/zeroth_korean/s5/cmd.sh b/egs/zeroth_korean/s5/cmd.sh new file mode 100644 index 00000000000..34031439792 --- /dev/null +++ b/egs/zeroth_korean/s5/cmd.sh @@ -0,0 +1,17 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. 
Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" +export normalize_cmd="queue.pl --mem 4G" + diff --git a/egs/zeroth_korean/s5/conf/decode.config b/egs/zeroth_korean/s5/conf/decode.config new file mode 100644 index 00000000000..7ba966f2b83 --- /dev/null +++ b/egs/zeroth_korean/s5/conf/decode.config @@ -0,0 +1 @@ +# empty config, just use the defaults. diff --git a/egs/zeroth_korean/s5/conf/mfcc.conf b/egs/zeroth_korean/s5/conf/mfcc.conf new file mode 100644 index 00000000000..7361509099f --- /dev/null +++ b/egs/zeroth_korean/s5/conf/mfcc.conf @@ -0,0 +1 @@ +--use-energy=false # only non-default option. diff --git a/egs/zeroth_korean/s5/conf/mfcc_hires.conf b/egs/zeroth_korean/s5/conf/mfcc_hires.conf new file mode 100644 index 00000000000..434834a6725 --- /dev/null +++ b/egs/zeroth_korean/s5/conf/mfcc_hires.conf @@ -0,0 +1,10 @@ +# config for high-resolution MFCC features, intended for neural network training +# Note: we keep all cepstra, so it has the same info as filterbank features, +# but MFCC is more easily compressible (because less correlated) which is why +# we prefer this method. +--use-energy=false # use average of log energy, not energy. +--num-mel-bins=40 # similar to Google's setup. +--num-ceps=40 # there is no dimensionality reduction. +--low-freq=20 # low cutoff frequency for mel bins... this is high-bandwidth data, so + # there might be some information at the low end. 
+--high-freq=-400 # high cutoff frequency, relative to Nyquist of 8000 (=7600) diff --git a/egs/zeroth_korean/s5/conf/online_cmvn.conf b/egs/zeroth_korean/s5/conf/online_cmvn.conf new file mode 100644 index 00000000000..7748a4a4dd3 --- /dev/null +++ b/egs/zeroth_korean/s5/conf/online_cmvn.conf @@ -0,0 +1 @@ +# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh diff --git a/egs/zeroth_korean/s5/local/chain/compare_wer.sh b/egs/zeroth_korean/s5/local/chain/compare_wer.sh new file mode 100755 index 00000000000..e8366bfb358 --- /dev/null +++ b/egs/zeroth_korean/s5/local/chain/compare_wer.sh @@ -0,0 +1,107 @@ +#!/bin/bash + +# this script is used for comparing decoding results between systems. +# e.g. local/chain/compare_wer.sh exp/chain/tdnn_{c,d}_sp +# For use with discriminatively trained systems you specify the epochs after a colon: +# for instance, +# local/chain/compare_wer.sh exp/chain/tdnn_c_sp exp/chain/tdnn_c_sp_smbr:{1,2,3} + + +if [ $# == 0 ]; then + echo "Usage: $0: [ ... ]" + echo "e.g.: $0 exp/chain/tdnn_{b,c}_sp" + echo "or (with epoch numbers for discriminative training):" + echo "$0 exp/chain/tdnn_b_sp_disc:{1,2,3}" + exit 1 +fi + +echo "# $0 $*" + +used_epochs=false + +# this function set_names is used to separate the epoch-related parts of the name +# [for discriminative training] and the regular parts of the name. 
+# If called with a colon-free directory name, like: +# set_names exp/chain/tdnn_lstm1e_sp_bi_smbr +# it will set dir=exp/chain/tdnn_lstm1e_sp_bi_smbr and epoch_infix="" +# If called with something like: +# set_names exp/chain/tdnn_d_sp_smbr:3 +# it will set dir=exp/chain/tdnn_d_sp_smbr and epoch_infix="_epoch3" + + +set_names() { + if [ $# != 1 ]; then + echo "compare_wer_general.sh: internal error" + exit 1 # exit the program + fi + dirname=$(echo $1 | cut -d: -f1) + epoch=$(echo $1 | cut -s -d: -f2) + if [ -z $epoch ]; then + epoch_infix="" + else + used_epochs=true + epoch_infix=_epoch${epoch} + fi +} + + + +echo -n "# System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +strings=( + "#WER test_clean (tgsmall) " + "#WER test_clean (fglarge) ") + +for n in 0 1 ; do + echo -n "${strings[$n]}" + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + decode_names=(tgsmall_test_clean fglarge_test_clean) + + wer=$(grep WER ${dirname}_online/decode_${decode_names[$n]}/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo +done + + +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. 
+fi + + +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final train prob (xent)" +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob (xent)" +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Num-params " +for x in $*; do + printf "% 10s" $(grep num-parameters $x/log/progress.1.log | awk '{print $2}') +done +echo diff --git a/egs/zeroth_korean/s5/local/chain/run_tdnn.sh b/egs/zeroth_korean/s5/local/chain/run_tdnn.sh new file mode 120000 index 00000000000..34499362831 --- /dev/null +++ b/egs/zeroth_korean/s5/local/chain/run_tdnn.sh @@ -0,0 +1 @@ +tuning/run_tdnn_1a.sh \ No newline at end of file diff --git a/egs/zeroth_korean/s5/local/chain/run_tdnn_opgru.sh b/egs/zeroth_korean/s5/local/chain/run_tdnn_opgru.sh new file mode 120000 index 00000000000..aedd4c8b4ac --- /dev/null +++ b/egs/zeroth_korean/s5/local/chain/run_tdnn_opgru.sh @@ -0,0 +1 @@ +tuning/run_tdnn_opgru_1a.sh \ No newline at end of file diff --git a/egs/zeroth_korean/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/zeroth_korean/s5/local/chain/tuning/run_tdnn_1a.sh new file mode 100755 index 00000000000..55e046dd55a --- /dev/null +++ b/egs/zeroth_korean/s5/local/chain/tuning/run_tdnn_1a.sh @@ -0,0 +1,290 @@ +#!/bin/bash + +set -e -o pipefail + +# This recipe trains TDNN-F AM +# The training recipe is from WSJ example(egs/wsj/s5/local/chain/tuning/run_tdnn_1g.sh) + +# steps/info/chain_dir_info.pl exp/chain/tdnn1a_sp +# 
exp/chain/tdnn1b_sp: num-iters=174 nj=2..8 num-params=12.9M dim=40+100->3040 combine=-0.041->-0.041 (over 2) xent:train/valid[115,173,final]=(-1.14,-0.759,-0.751/-1.14,-0.788,-0.777) logprob:train/valid[115,173,final]=(-0.084,-0.047,-0.046/-0.080,-0.050,-0.048) + +# ./local/chain/compare_wer.sh exp/chain/tdnn1a_sp +# System tdnn1b_sp +#WER test_clean (tgsmall) 17.65 +#WER test_clean (fglarge) 10.55 +# Final train prob -0.0460 +# Final valid prob -0.0480 +# Final train prob (xent) -0.7512 +# Final valid prob (xent) -0.7769 +# Num-params 12922560 + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +train_set=train_clean +speed_perturb=true +test_sets="test_clean" +gmm=tri4 # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. + +# Options which are not passed through to run_ivector_common.sh +affix=1a #affix for TDNN directory e.g. "1a" or "1b", in case we change the configuration. +common_egs_dir= + +# LSTM/chain options +train_stage=-10 +xent_regularize=0.1 +dropout_schedule='0,0@0.20,0.5@0.50,0' + +# training chunk-options +chunk_width=140,100,160 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 + +# training options +srand=0 +remove_egs=true + +#decode options +test_online_decoding=true # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 9 ]; then + # Get the alignments as lattices (gives the chain training more freedom). 
+ # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 10 ]; then + # Build a tree using our new topology. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + +if [ $stage -le 11 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + tdnn_opts="l2-regularize=0.01 dropout-proportion=0.0 dropout-per-dim-continuous=true" + tdnnf_opts="l2-regularize=0.01 dropout-proportion=0.0 bypass-scale=0.66" + linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0" + prefinal_opts="l2-regularize=0.01" + output_opts="l2-regularize=0.005" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $tdnn_opts dim=1280 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1280 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1280 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1280 bottleneck-dim=160 time-stride=1 + tdnnf-layer 
name=tdnnf5 $tdnnf_opts dim=1280 bottleneck-dim=160 time-stride=0 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1280 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=1280 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1280 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1280 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1280 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1280 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1280 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1280 bottleneck-dim=160 time-stride=3 + linear-component name=prefinal-l dim=256 $linear_opts + + + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1280 small-dim=256 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=1280 small-dim=256 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/wsj-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=10 \ + --trainer.frames-per-iter=2000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=8 \ + --trainer.optimization.initial-effective-lrate=0.0005 \ + --trainer.optimization.final-effective-lrate=0.00005 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=0 \ + --egs.chunk-right-context=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; + +fi + +if [ $stage -le 13 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. 
+ + utils/lang/check_phones_compatible.sh \ + data/lang_test_tgsmall/phones.txt $lang/phones.txt + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgsmall \ + $tree_dir $tree_dir/graph_tgsmall || exit 1; +fi + +if $test_online_decoding && [ $stage -le 14 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. + steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3/extractor ${dir} ${dir}_online + + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + data_affix=$(echo $data | sed s/test_//) + nspk=$(wc -l 3040 combine=-0.045->-0.045 (over 1) xent:train/valid[65,98,final]=(-1.19,-0.661,-0.647/-1.21,-0.696,-0.680) logprob:train/valid[65,98,final]=(-0.080,-0.039,-0.038/-0.076,-0.039,-0.038) + +# ./local/chain/compare_wer.sh exp/chain/tdnn_opgru1a_sp +# System tdnn_opgru1a_sp +#WER test_clean (tgsmall) 15.22 +#WER test_clean (fglarge) 9.45 +# Final train prob -0.0373 +# Final valid prob -0.0386 +# Final train prob (xent) -0.6506 +# Final valid prob (xent) -0.6837 +# Num-params 37970368 + + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +train_set=train_clean +speed_perturb=true +test_sets="test_clean" +gmm=tri4 # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. + +# Options which are not passed through to run_ivector_common.sh +affix=1a #affix for TDNN directory e.g. "1a" or "1b", in case we change the configuration. 
+common_egs_dir= + +# OPGRU/chain options +train_stage=-10 +get_egs_stage=-10 + +xent_regularize=0.1 +dropout_schedule='0,0@0.20,0.2@0.50,0' + +chunk_width=140,100,160 +label_delay=5 + +remove_egs=true + + +#decode options +test_online_decoding=true # if true, it will run the last decoding stage. + +# decode options +extra_left_context=50 +frames_per_chunk= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 9 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 10 ]; then + # Build a tree using our new topology. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + gru_opts="dropout-per-frame=true dropout-proportion=0.0" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=1024 + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 + + # check steps/libs/nnet3/xconfig/gru.py for the other options and defaults + norm-opgru-layer name=opgru1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $gru_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024 + norm-opgru-layer name=opgru2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $gru_opts + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=1024 + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=1024 + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=1024 + norm-opgru-layer name=opgru3 cell-dim=1024 
recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $gru_opts + + ## adding the layers for chain branch + output-layer name=output input=opgru3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=opgru3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + +fi + + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/wsj-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context 40 \ + --egs.chunk-right-context 0 \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.optimization.backstitch-training-scale 0.3 \ + --trainer.optimization.backstitch-training-interval 1 \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 2000000 \ + --trainer.num-epochs=8 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --trainer.deriv-truncate-margin 8 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir + +fi + +if [ $stage -le 13 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. 
+ + utils/lang/check_phones_compatible.sh \ + data/lang_test_tgsmall/phones.txt $lang/phones.txt + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgsmall \ + $tree_dir $tree_dir/graph_tgsmall || exit 1; +fi + +if $test_online_decoding && [ $stage -le 14 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. + steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3/extractor ${dir} ${dir}_online + + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + data_affix=$(echo $data | sed s/test_//) + nspk=$(wc -l " + echo "e.g.: $0 ./db/train_data_01 data/train_data_01" + exit 1 +fi + +db_dir=$1 +data_part=$2 + +src=${db_dir}/${data_part} +dst=data/${data_part} + +# all utterances are FLAC compressed +if ! which flac >&/dev/null; then + echo "Please install 'flac' on ALL worker nodes!" + exit 1 +fi + +spk_file=${db_dir}/AUDIO_INFO + +mkdir -p $dst || exit 1; + +[ ! -d $src ] && echo "$0: no such directory $src" && exit 1; +[ ! -f $spk_file ] && echo "$0: expected file $spk_file to exist" && exit 1; + +wav_scp=$dst/wav.scp; [[ -f "$wav_scp" ]] && rm $wav_scp +trans=$dst/text; [[ -f "$trans" ]] && rm $trans +utt2spk=$dst/utt2spk; [[ -f "$utt2spk" ]] && rm $utt2spk +spk2gender=$dst/spk2gender; [[ -f $spk2gender ]] && rm $spk2gender +utt2dur=$dst/utt2dur; [[ -f "$utt2dur" ]] && rm $utt2dur + +for scriptid_dir in $(find -L $src -mindepth 1 -maxdepth 1 -type d | sort); do + scriptid=$(basename $scriptid_dir) + if ! [ $scriptid -eq $scriptid ]; then # not integer. + echo "$0: unexpected subdirectory name $scriptid" + exit 1; + fi + + for reader_dir in $(find -L $scriptid_dir/ -mindepth 1 -maxdepth 1 -type d | sort); do + reader=$(basename $reader_dir) + if ! 
[ "$reader" -eq "$reader" ]; then + echo "$0: unexpected reader-subdirectory name $reader" + exit 1; + fi + + reader_gender=$(egrep "^$reader\|" $spk_file | awk -F'|' '{gsub(/[ ]+/, ""); print tolower($3)}') + if [ "$reader_gender" != 'm' ] && [ "$reader_gender" != 'f' ]; then + echo "Unexpected gender: '$reader_gender'" + exit 1; + fi + + echo " "$scriptid $reader $reader_gender + + find -L $reader_dir/ -iname "*.flac" | sort | xargs -I% basename % .flac | \ + awk -v "dir=$reader_dir" '{printf "%s flac -c -d -s %s/%s.flac |\n", $0, dir, $0}' >>$wav_scp|| exit 1 + + reader_trans=$reader_dir/${reader}_${scriptid}.trans.txt + [ ! -f $reader_trans ] && echo "$0: expected file $reader_trans to exist" && exit 1 + cat $reader_trans >>$trans + + # NOTE: Each chapter is dedicated to each speaker. + awk -v "reader=$reader" -v "scriptid=$scriptid" '{printf "%s %s_%s\n", $1, reader, scriptid}' \ + <$reader_trans >>$utt2spk || exit 1 + + # reader -> gender map (again using per-chapter granularity) + echo "${reader}_${scriptid} $reader_gender" >>$spk2gender + + done +done + +# sort +cat $wav_scp | sort > tmp +cp tmp $wav_scp +cat $trans | sort > tmp +cp tmp $trans +cat $utt2spk | sort > tmp +cp tmp $utt2spk +cat $spk2gender | sort > tmp +cp tmp $spk2gender +rm tmp + + +spk2utt=$dst/spk2utt +utils/utt2spk_to_spk2utt.pl <$utt2spk >$spk2utt || exit 1 + +ntrans=$(wc -l <$trans) +nutt2spk=$(wc -l <$utt2spk) +! 
[ "$ntrans" -eq "$nutt2spk" ] && \ + echo "Inconsistent #transcripts($ntrans) and #utt2spk($nutt2spk)" && exit 1; + +utils/data/get_utt2dur.sh $dst 1>&2 || exit 1 + +utils/validate_data_dir.sh --no-feats $dst || exit 1; + +echo "$0: successfully prepared data in $dst" + +exit 0 diff --git a/egs/zeroth_korean/s5/local/download_and_untar.sh b/egs/zeroth_korean/s5/local/download_and_untar.sh new file mode 100755 index 00000000000..2e62a3273d4 --- /dev/null +++ b/egs/zeroth_korean/s5/local/download_and_untar.sh @@ -0,0 +1,61 @@ +#!/bin/bash + +# Copyright 2018 Lucas Jo (Atlas Guide) +# 2018 Wonkyum Lee (Gridspace) +# Apache 2.0 + +if [ $# -ne "1" ]; then + echo "Usage: $0 " + echo "e.g.: $0 ./db" + exit 1 +fi + +exists(){ + command -v "$1" >/dev/null 2>&1 +} + + +dir=$1 +local_lm_dir=data/local/lm + +AUDIOINFO='AUDIO_INFO' +AUDIOLIST='train_data_01 test_data_01' + +echo "Now download corpus ----------------------------------------------------" +if [ ! -f $dir/db.tar.gz ]; then + if [ ! -d $dir ]; then + mkdir -p $dir + fi + wget -O $dir/db.tar.gz http://www.openslr.org/resources/40/zeroth_korean.tar.gz +else + echo " $dir/db.tar.gz already exist" +fi + +echo "Now extract corpus ----------------------------------------------------" +if [ ! -f $dir/$AUDIOINFO ]; then + tar -zxvf $dir/db.tar.gz -C $dir + else + echo " corpus already extracted" +fi + +if [ ! 
-d $local_lm_dir ]; then + mkdir -p $local_lm_dir +fi +echo "Check LMs files" +LMList="\ + zeroth.lm.fg.arpa.gz \ + zeroth.lm.tg.arpa.gz \ + zeroth.lm.tgmed.arpa.gz \ + zeroth.lm.tgsmall.arpa.gz \ + zeroth_lexicon \ + zeroth_morfessor.seg" + +for file in $LMList; do + if [ -f $local_lm_dir/$file ]; then + echo $file already exist + else + echo "Linking "$file + ln -s $PWD/$dir/$file $local_lm_dir/$file + fi +done +echo "all the files (lexicon, LM, segment model) are ready" diff --git a/egs/zeroth_korean/s5/local/format_lms.sh b/egs/zeroth_korean/s5/local/format_lms.sh new file mode 100755 index 00000000000..a9111e80eeb --- /dev/null +++ b/egs/zeroth_korean/s5/local/format_lms.sh @@ -0,0 +1,65 @@ +#!/bin/bash + +# Copyright 2014 Vassil Panayotov +# Apache 2.0 + +# Prepares the test time language model(G) transducers +# (adapted from wsj/s5/local/wsj_format_data.sh) + +# Modified by Lucas Jo 2017 (Altas Guide) + +. ./path.sh || exit 1; + +# begin configuration section +src_dir=data/lang +# end configuration section + +. utils/parse_options.sh || exit 1; + +set -e + +if [ $# -ne 1 ]; then + echo "Usage: $0 " + echo "e.g.: $0 /export/a15/vpanayotov/data/lm" + echo ", where:" + echo " is the directory in which the language model is stored/downloaded" + echo "Options:" + echo " --src-dir # source lang directory, default data/lang" + exit 1 +fi + +lm_dir=$1 + +if [ ! -d $lm_dir ]; then + echo "$0: expected source LM directory $lm_dir to exist" + exit 1; +fi +if [ ! -f $src_dir/words.txt ]; then + echo "$0: expected $src_dir/words.txt to exist." + exit 1; +fi + + +tmpdir=data/local/lm_tmp.$$ +trap "rm -r $tmpdir" EXIT + +mkdir -p $tmpdir + +#lm_sets="tgsmall tgmed" +lm_sets="tgsmall" +for lm_suffix in ${lm_sets}; do + # tglarge is prepared by a separate command, called from run.sh; we don't + # want to compile G.fst for tglarge, as it takes a while. 
+ test=${src_dir}_test_${lm_suffix} + mkdir -p $test + cp -r ${src_dir}/* $test + gunzip -c $lm_dir/zeroth.lm.${lm_suffix}.arpa.gz | \ + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=$test/words.txt - $test/G.fst + + utils/validate_lang.pl --skip-determinization-check $test || exit 1; +done + +echo "Succeeded in formatting data." + +exit 0 diff --git a/egs/zeroth_korean/s5/local/nnet3/run_ivector_common.sh b/egs/zeroth_korean/s5/local/nnet3/run_ivector_common.sh new file mode 100755 index 00000000000..70be96310e1 --- /dev/null +++ b/egs/zeroth_korean/s5/local/nnet3/run_ivector_common.sh @@ -0,0 +1,108 @@ +#!/bin/bash + +# this script contains some common (shared) parts of the run_nnet*.sh scripts. +. cmd.sh + + +stage=0 +gmmdir=exp/tri4 +speed_perturb=false +trainset=train_clean + +set -e +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if [ "$speed_perturb" == "true" ]; then + if [ $stage -le 1 ]; then + echo "$0: preparing directory for speed-perturbed data" + #Although the nnet will be trained by high resolution data, we still have to perturbe the normal data to get the alignment + # _sp stands for speed-perturbed + for datadir in ${trainset} ; do + utils/data/perturb_data_dir_speed_3way.sh data/${datadir} data/${datadir}_sp + + mfccdir=mfcc_perturbed + steps/make_mfcc.sh --cmd "$train_cmd" --nj 40 \ + data/${datadir}_sp exp/make_mfcc/${datadir}_sp $mfccdir || exit 1; + steps/compute_cmvn_stats.sh data/${datadir}_sp exp/make_mfcc/${datadir}_sp $mfccdir || exit 1; + utils/fix_data_dir.sh data/${datadir}_sp + done + fi + + if [ $stage -le 2 ]; then + echo "$0: aligning with the perturbed low-resolution data" + #obtain the alignment of the perturbed data + steps/align_fmllr.sh --nj 100 --cmd "$train_cmd" \ + data/${trainset}_sp data/lang_nosp ${gmmdir} ${gmmdir}_ali_${trainset}_sp || exit 1 + fi + trainset=${trainset}_sp +fi + +if [ $stage -le 3 ]; then + # Create high-resolution MFCC features (with 40 cepstra instead of 13). 
+ # this shows how you can split across multiple file-systems. we'll split the + # MFCC dir across multiple locations. You might want to be careful here, if you + # have multiple copies of Kaldi checked out and run the same recipe, not to let + # them overwrite each other. + + echo "$0: creating high-resolution MFCC features" + for datadir in ${trainset} ; do + utils/copy_data_dir.sh data/$datadir data/${datadir}_hires + steps/make_mfcc.sh --nj 40 --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/${datadir}_hires || exit 1; + steps/compute_cmvn_stats.sh data/${datadir}_hires || exit 1; + done + + # We need to build a small system just because we need PCA transform + # to train the diag-UBM on top of. + utils/subset_data_dir.sh data/${trainset}_hires 30000 data/train_30k_hires +fi + + +if [ $stage -le 4 ]; then + # Train a small system just for its PCA transform. + echo "$0: computing a PCA transform from the hires data." + mkdir exp -p exp/nnet3 + steps/online/nnet2/get_pca_transform.sh --cmd "$train_cmd" \ + --splice-opts "--left-context=3 --right-context=3" \ + --max-utts 30000 --subsample 2 \ + data/train_30k_hires exp/nnet3/pca_transform +fi + +if [ $stage -le 5 ]; then + # To train a diagonal UBM we don't need very much data, so use a small subset + echo "$0: training the diagonal UBM." + steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 --num-frames 700000 \ + data/train_30k_hires 512 exp/nnet3/pca_transform exp/nnet3/diag_ubm +fi + +if [ $stage -le 6 ]; then + # Train the iVector extractor. Use all of the speed-perturbed data since iVector extractors + # can be sensitive to the amount of data. 
The script defaults to an iVector dimension of 100 + echo "$0: training the iVector extractor" + steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \ + data/${trainset}_hires exp/nnet3/diag_ubm exp/nnet3/extractor || exit 1; +fi + +if [ $stage -le 7 ]; then + ivectordir=exp/nnet3/ivectors_${trainset}_hires + + # We extract iVectors on all the train data, which will be what we train the + # system on. With --utts-per-spk-max 2, the script pairs the utterances + # into twos, and treats each of these pairs as one speaker. Note that these + # are extracted 'online'. + + # having a larger number of speakers is helpful for generalization, and to + # handle per-utterance decoding well (iVector starts at zero). + echo "$0: extracting iVector using trained iVector extractor" + utils/data/modify_speaker_info.sh --utts-per-spk-max 2 \ + data/${trainset}_hires data/${trainset}_hires_max2 + + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 60 \ + data/${trainset}_hires_max2 exp/nnet3/extractor $ivectordir || exit 1; +fi + + +exit 0; diff --git a/egs/zeroth_korean/s5/local/prepare_dict.sh b/egs/zeroth_korean/s5/local/prepare_dict.sh new file mode 100755 index 00000000000..76c6821e11e --- /dev/null +++ b/egs/zeroth_korean/s5/local/prepare_dict.sh @@ -0,0 +1,65 @@ +#!/bin/bash + +# Copyright 2014 Vassil Panayotov +# Apache 2.0 + +# Modified by Lucas Jo 2017 (Atlas Guide) +# Prepare dictionary + +if [ $# -ne 2 ]; then + echo "Usage: $0 <lm-dir> <dst-dir>" + echo "e.g.: /data/local/lm data/local/dict_nosp" + exit 1 +fi +lm_dir=$1 +dst_dir=$2 + +mkdir -p $dst_dir || exit 1; + +# this file is a copy of the lexicon we obtained from download_lm.sh process +lexicon_raw_nosil=$dst_dir/lexicon_raw_nosil.txt + +if [[ !
-s "$lexicon_raw_nosil" ]]; then + cp $lm_dir/zeroth_lexicon $lexicon_raw_nosil || exit 1 +fi + +silence_phones=$dst_dir/silence_phones.txt +optional_silence=$dst_dir/optional_silence.txt +nonsil_phones=$dst_dir/nonsilence_phones.txt +extra_questions=$dst_dir/extra_questions.txt + +echo "Preparing phone lists and clustering questions" +(echo SIL; echo SPN;) > $silence_phones +#( echo SIL; echo BRH; echo CGH; echo NSN ; echo SMK; echo UM; echo UHH ) > $silence_phones +echo SIL > $optional_silence +# nonsilence phones; on each line is a list of phones that correspond +# really to the same base phone. +awk '{for (i=2; i<=NF; ++i) { print $i; gsub(/[0-9]/, "", $i); print $i}}' $lexicon_raw_nosil |\ + sort -u |\ + perl -e 'while(<>){ + chop; m:^([^\d]+)(\d*)$: || die "Bad phone $_"; + $phones_of{$1} .= "$_ "; } + foreach $list (values %phones_of) {print $list . "\n"; } ' \ + > $nonsil_phones || exit 1; +# A few extra questions that will be added to those obtained by +# automatically clustering +# the "real" phones. These ask about stress; there's also one for +# silence. 
+cat $silence_phones| awk '{printf("%s ", $1);} END{printf "\n";}' > $extra_questions || exit 1; +cat $nonsil_phones | perl -e 'while(<>){ foreach $p (split(" ", $_)){ +$p =~ m:^([^\d]+)(\d*)$: || die "Bad phone $_"; $q{$2} .= "$p "; } } foreach $l (values %q) {print "$l\n";}' \ + >> $extra_questions || exit 1; + +echo "$(wc -l <$silence_phones) silence phones saved to: $silence_phones" +echo "$(wc -l <$optional_silence) optional silence saved to: $optional_silence" +echo "$(wc -l <$nonsil_phones) non-silence phones saved to: $nonsil_phones" +echo "$(wc -l <$extra_questions) extra triphone clustering-related questions saved to: $extra_questions" + +#(echo '!SIL SIL'; echo '[BREATH] BRH'; echo '[NOISE] NSN'; echo '[COUGH] CGH'; +# echo '[SMACK] SMK'; echo '[UM] UM'; echo '[UH] UHH' +# echo ' NSN' ) | \ +(echo '!SIL SIL'; echo '<SPOKEN_NOISE> SPN'; echo '<UNK> SPN'; ) |\ + cat - $lexicon_raw_nosil | sort | uniq >$dst_dir/lexicon.txt +echo "Lexicon text file saved as: $dst_dir/lexicon.txt" +exit 0 + diff --git a/egs/zeroth_korean/s5/local/score.sh b/egs/zeroth_korean/s5/local/score.sh new file mode 100755 index 00000000000..c812199fc98 --- /dev/null +++ b/egs/zeroth_korean/s5/local/score.sh @@ -0,0 +1,63 @@ +#!/bin/bash +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey) +# 2014 Guoguo Chen +# Apache 2.0 + +[ -f ./path.sh ] && . ./path.sh + +# begin configuration section. +cmd=run.pl +stage=0 +decode_mbr=true +word_ins_penalty=0.0,0.5,1.0 +min_lmwt=7 +max_lmwt=17 +iter=final +#end configuration section. + +[ -f ./path.sh ] && . ./path.sh +. parse_options.sh || exit 1; + +if [ $# -ne 3 ]; then + echo "Usage: local/score.sh [--cmd (run.pl|queue.pl...)] <data-dir> <lang-dir|graph-dir> <decode-dir>" + echo " Options:" + echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." + echo " --stage (0|1|2) # start scoring script from part-way through." + echo " --decode_mbr (true/false) # maximum bayes risk decoding (confusion network)."
+ echo " --min_lmwt # minimum LM-weight for lattice rescoring " + echo " --max_lmwt # maximum LM-weight for lattice rescoring " + exit 1; +fi + +data=$1 +lang_or_graph=$2 +dir=$3 + +symtab=$lang_or_graph/words.txt + +for f in $symtab $dir/lat.1.gz $data/text; do + [ ! -f $f ] && echo "score.sh: no such file $f" && exit 1; +done + +mkdir -p $dir/scoring/log + +cat $data/text | sed 's:<NOISE>::g' | sed 's:<SPOKEN_NOISE>::g' > $dir/scoring/test_filt.txt + +for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do + $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/best_path.LMWT.$wip.log \ + lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ + lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \ + lattice-best-path --word-symbol-table=$symtab \ + ark:- ark,t:$dir/scoring/LMWT.$wip.tra || exit 1; +done + +# Note: the double level of quoting for the sed command +for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do + $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.$wip.log \ + cat $dir/scoring/LMWT.$wip.tra \| \ + utils/int2sym.pl -f 2- $symtab \| sed 's:\<UNK\>::g' \| \ + compute-wer --text --mode=present \ + ark:$dir/scoring/test_filt.txt ark,p:- ">&" $dir/wer_LMWT_$wip || exit 1; +done + +exit 0; diff --git a/egs/zeroth_korean/s5/local/update_segmentation.sh b/egs/zeroth_korean/s5/local/update_segmentation.sh new file mode 100755 index 00000000000..e1eea821645 --- /dev/null +++ b/egs/zeroth_korean/s5/local/update_segmentation.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +# Copyright 2017 Lucas Jo (Atlas Guide) +# Apache 2.0 + +# do this when the segmentation rule is changed +dataDir=$1 +lmDir=$2 + +exists(){ + command -v "$1" >/dev/null 2>&1 +} + +# check morfessor installation +if ! exists morfessor; then + echo "You appear to not have Morfessor installed, either on your path." + echo "See tools/extras/install_morfessor.sh installation instructions."
+ exit 1 +fi + +trans=$dataDir/text +echo "Re-segment transcripts: $trans --------------------------------------------" +if [ ! -f $trans ]; then + echo "transcription file is not found in "$dataDir + exit 1 +fi +cp $trans $trans".old" +awk '{print $1}' $trans".old" > $trans"_tmp_index" +cut -d' ' -f2- $trans".old" |\ + sed -E 's/\s+/ /g; s/^\s//g; s/\s$//g' |\ + morfessor -e 'utf-8' -l $lmDir/zeroth_morfessor.seg -T - -o - \ + --output-format '{analysis} ' --output-newlines \ + --nosplit-re '[0-9\[\]\(\){}a-zA-Z&.,\-]+' \ + | paste -d" " $trans"_tmp_index" - > $trans +rm -f $trans"_tmp_index" + diff --git a/egs/zeroth_korean/s5/path.sh b/egs/zeroth_korean/s5/path.sh new file mode 100755 index 00000000000..2d17b17a84a --- /dev/null +++ b/egs/zeroth_korean/s5/path.sh @@ -0,0 +1,6 @@ +export KALDI_ROOT=`pwd`/../../.. +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C diff --git a/egs/zeroth_korean/s5/run.sh b/egs/zeroth_korean/s5/run.sh new file mode 100755 index 00000000000..c5c7506980b --- /dev/null +++ b/egs/zeroth_korean/s5/run.sh @@ -0,0 +1,264 @@ +#!/bin/bash +# +# Based mostly on the WSJ/Librispeech recipe. +# The training/testing database is described in http://www.openslr.org/40/ +# This corpus consists of 51hrs korean speech with cleaned automatic transcripts: +# +# Copyright 2018 Atlas Guide (Author : Lucas Jo) +# 2018 Gridspace Inc. (Author: Wonkyum Lee) +# +# Apache 2.0 +# + +# Check list before start +# 1. 
required software: Morfessor-2.0.1 (see tools/extras/install_morfessor.sh) + +stage=0 +db_dir=./db +nj=16 + +chain_train=true +decode=true # set false if you don't want to decode each GMM model +decode_rescoring=true # set false if you don't want to rescore with large language model +test_set="test_clean" + +. ./cmd.sh +. ./path.sh +. utils/parse_options.sh # e.g. this parses the --stage option if supplied. + +# you might not want to do this for interactive shells. +set -e + +if [ $stage -le 0 ]; then + # download the data. + local/download_and_untar.sh $db_dir +fi + +if [ $stage -le 1 ]; then + # format the data as Kaldi data directories + for part in train_data_01 test_data_01; do + # use underscore-separated names in data directories. + local/data_prep.sh $db_dir $part + done +fi + +if [ $stage -le 2 ]; then + # update segmentation of transcripts + for part in train_data_01 test_data_01; do + local/update_segmentation.sh data/$part data/local/lm + done +fi + +if [ $stage -le 3 ]; then + # prepare dictionary and language model + local/prepare_dict.sh data/local/lm data/local/dict_nosp + + utils/prepare_lang.sh data/local/dict_nosp \ + "<UNK>" data/local/lang_tmp_nosp data/lang_nosp +fi + +if [ $stage -le 4 ]; then + # build testing language model + local/format_lms.sh --src-dir data/lang_nosp data/local/lm + + # re-scoring language model + if $decode_rescoring ; then + utils/build_const_arpa_lm.sh data/local/lm/zeroth.lm.tg.arpa.gz \ + data/lang_nosp data/lang_nosp_test_tglarge + utils/build_const_arpa_lm.sh data/local/lm/zeroth.lm.fg.arpa.gz \ + data/lang_nosp data/lang_nosp_test_fglarge + fi +fi + + +if [ $stage -le 5 ]; then + # Feature extraction (MFCC) + mfccdir=mfcc + for part in train_data_01 test_data_01; do + steps/make_mfcc.sh --cmd "$train_cmd" --nj $nj data/$part exp/make_mfcc/$part $mfccdir + steps/compute_cmvn_stats.sh data/$part exp/make_mfcc/$part $mfccdir + done + + # ...
and then combine data sets into one (for later extension) + utils/combine_data.sh \ + data/train_clean data/train_data_01 + + utils/combine_data.sh \ + data/test_clean data/test_data_01 + + # Make some small data subsets for early system-build stages. + utils/subset_data_dir.sh --shortest data/train_clean 2000 data/train_2kshort + utils/subset_data_dir.sh data/train_clean 5000 data/train_5k + utils/subset_data_dir.sh data/train_clean 10000 data/train_10k +fi + +if [ $stage -le 6 ]; then + echo "$0: #### Monophone Training ###########" + # train a monophone system with 2k short utts + steps/train_mono.sh --boost-silence 1.25 --nj $nj --cmd "$train_cmd" \ + data/train_2kshort data/lang_nosp exp/mono + if $decode; then + utils/mkgraph.sh data/lang_nosp_test_tgsmall exp/mono exp/mono/graph_nosp_tgsmall + nspk=$(wc -l " data/local/lang_tmp data/lang + + local/format_lms.sh --src-dir data/lang data/local/lm + + utils/build_const_arpa_lm.sh \ + data/local/lm/zeroth.lm.tg.arpa.gz data/lang data/lang_test_tglarge + utils/build_const_arpa_lm.sh \ + data/local/lm/zeroth.lm.fg.arpa.gz data/lang data/lang_test_fglarge + + if $decode; then + utils/mkgraph.sh data/lang_test_tgsmall exp/tri3 exp/tri3/graph_tgsmall + nspk=$(wc -l