From 13071f1ef2a2a6ec91e5adeb3edd2f05beb05c61 Mon Sep 17 00:00:00 2001 From: Wonkyum Lee Date: Mon, 19 Mar 2018 22:56:15 -0700 Subject: [PATCH 01/26] initial setting --- egs/zeroth_korean/s5/RESULTS | 24 +++++++++++++++++++++ egs/zeroth_korean/s5/cmd.sh | 25 ++++++++++++++++++++++ egs/zeroth_korean/s5/conf/decode.config | 1 + egs/zeroth_korean/s5/conf/mfcc.conf | 1 + egs/zeroth_korean/s5/conf/mfcc_hires.conf | 10 +++++++++ egs/zeroth_korean/s5/conf/online_cmvn.conf | 1 + egs/zeroth_korean/s5/conf/queue.conf | 10 +++++++++ egs/zeroth_korean/s5/path.sh | 6 ++++++ egs/zeroth_korean/s5/steps | 1 + egs/zeroth_korean/s5/utils | 1 + 10 files changed, 80 insertions(+) create mode 100644 egs/zeroth_korean/s5/RESULTS create mode 100644 egs/zeroth_korean/s5/cmd.sh create mode 100644 egs/zeroth_korean/s5/conf/decode.config create mode 100644 egs/zeroth_korean/s5/conf/mfcc.conf create mode 100644 egs/zeroth_korean/s5/conf/mfcc_hires.conf create mode 100644 egs/zeroth_korean/s5/conf/online_cmvn.conf create mode 100644 egs/zeroth_korean/s5/conf/queue.conf create mode 100755 egs/zeroth_korean/s5/path.sh create mode 120000 egs/zeroth_korean/s5/steps create mode 120000 egs/zeroth_korean/s5/utils diff --git a/egs/zeroth_korean/s5/RESULTS b/egs/zeroth_korean/s5/RESULTS new file mode 100644 index 00000000000..8a189e3f501 --- /dev/null +++ b/egs/zeroth_korean/s5/RESULTS @@ -0,0 +1,24 @@ +# In the results below, "tgsmall" is the pruned 3-gram LM, which is used for lattice generation. 
+# The following language models are then used for rescoring: +# a) tglarge- the full, non-pruned 3-gram LM +# b) fglarge- non-pruned 4-gram LM +# The "test-clean" sets generally contain relatively cleaner Korean speech, +# the "test_200" are a subset of "test-clean", designed for quick evaluation + +### SAT GMM model trained on the "train-01" set (51 hours "clean" speech) +decode_fglarge_test_200/wer_14_0.5:%WER 21.17 [ 873 / 4124, 93 ins, 172 del, 608 sub ] +decode_tglarge_test_200/wer_15_0.0:%WER 21.46 [ 885 / 4124, 101 ins, 168 del, 616 sub ] +decode_tgsmall_test_200/wer_14_0.5:%WER 33.83 [ 1395 / 4124, 85 ins, 330 del, 980 sub ] +decode_tgsmall_test_200.si/wer_14_0.0:%WER 46.02 [ 1898 / 4124, 133 ins, 389 del, 1376 sub ] + +### Chain model trained on the "train-01" set +tdnn1n_online/decode_fglarge_test_200/wer_13_1.0:%WER 11.25 [ 464 / 4124, 65 ins, 78 del, 321 sub ] +tdnn1n_online/decode_tgsmall_test_200/wer_13_0.0:%WER 18.09 [ 746 / 4124, 89 ins, 123 del, 534 sub ] +tdnn_opgru_1c_sp_online/decode_fglarge_test_200/wer_8_1.0:%WER 9.00 [ 371 / 4124, 50 ins, 63 del, 258 sub ] +tdnn_opgru_1c_sp_online/decode_tgsmall_test_200/wer_8_0.5:%WER 14.06 [ 580 / 4124, 62 ins, 92 del, 426 sub ] + +### Chain model trained on the "train-01" set with multi-conditioned data augmentation +tdnn1n_rvb_online/decode_fglarge_test_200/wer_10_0.0:%WER 10.11 [ 417 / 4124, 73 ins, 57 del, 287 sub ] +tdnn1n_rvb_online/decode_tgsmall_test_200/wer_8_0.5:%WER 16.27 [ 671 / 4124, 87 ins, 91 del, 493 sub ] +tdnn_lstm_1e_rvb_online/decode_fglarge_test_200/wer_13_0.0:%WER 11.47 [ 473 / 4124, 74 ins, 61 del, 338 sub ] +tdnn_lstm_1e_rvb_online/decode_tgsmall_test_200/wer_12_1.0:%WER 16.97 [ 700 / 4124, 72 ins, 130 del, 498 sub ] diff --git a/egs/zeroth_korean/s5/cmd.sh b/egs/zeroth_korean/s5/cmd.sh new file mode 100644 index 00000000000..1687940f7d1 --- /dev/null +++ b/egs/zeroth_korean/s5/cmd.sh @@ -0,0 +1,25 @@ +# you can change cmd.sh depending on what type of queue you are using. 
+# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="run.pl --mem 2G" +export decode_cmd="run.pl --mem 4G" +export mkgraph_cmd="run.pl --mem 8G" +export normalize_cmd="run.pl --mem 4G" + +hostInAtlas="ares hephaestus jupiter neptune" +if [[ ! -z $(echo $hostInAtlas | grep -o $(hostname -f)) ]]; then + queue_conf=conf/queue.conf + export train_cmd="queue.pl --config $queue_conf --mem 4G" + export decode_cmd="queue.pl --config $queue_conf --mem 8G" + export mkgraph_cmd="queue.pl --config $queue_conf --mem 16G" + export normalize_cmd="queue.pl --config $queue_conf --mem 4G" +fi diff --git a/egs/zeroth_korean/s5/conf/decode.config b/egs/zeroth_korean/s5/conf/decode.config new file mode 100644 index 00000000000..7ba966f2b83 --- /dev/null +++ b/egs/zeroth_korean/s5/conf/decode.config @@ -0,0 +1 @@ +# empty config, just use the defaults. diff --git a/egs/zeroth_korean/s5/conf/mfcc.conf b/egs/zeroth_korean/s5/conf/mfcc.conf new file mode 100644 index 00000000000..7361509099f --- /dev/null +++ b/egs/zeroth_korean/s5/conf/mfcc.conf @@ -0,0 +1 @@ +--use-energy=false # only non-default option. 
diff --git a/egs/zeroth_korean/s5/conf/mfcc_hires.conf b/egs/zeroth_korean/s5/conf/mfcc_hires.conf new file mode 100644 index 00000000000..434834a6725 --- /dev/null +++ b/egs/zeroth_korean/s5/conf/mfcc_hires.conf @@ -0,0 +1,10 @@ +# config for high-resolution MFCC features, intended for neural network training +# Note: we keep all cepstra, so it has the same info as filterbank features, +# but MFCC is more easily compressible (because less correlated) which is why +# we prefer this method. +--use-energy=false # use average of log energy, not energy. +--num-mel-bins=40 # similar to Google's setup. +--num-ceps=40 # there is no dimensionality reduction. +--low-freq=20 # low cutoff frequency for mel bins... this is high-bandwidth data, so + # there might be some information at the low end. +--high-freq=-400 # high cutoff frequency, relative to Nyquist of 8000 (=7600) diff --git a/egs/zeroth_korean/s5/conf/online_cmvn.conf b/egs/zeroth_korean/s5/conf/online_cmvn.conf new file mode 100644 index 00000000000..7748a4a4dd3 --- /dev/null +++ b/egs/zeroth_korean/s5/conf/online_cmvn.conf @@ -0,0 +1 @@ +# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh diff --git a/egs/zeroth_korean/s5/conf/queue.conf b/egs/zeroth_korean/s5/conf/queue.conf new file mode 100644 index 00000000000..2aa9ee6a211 --- /dev/null +++ b/egs/zeroth_korean/s5/conf/queue.conf @@ -0,0 +1,10 @@ +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* +option mem=* -l mem_free=$0,ram_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -pe smp $0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 +option gpu=* -l gpu=$0 diff --git a/egs/zeroth_korean/s5/path.sh b/egs/zeroth_korean/s5/path.sh new file mode 100755 index 00000000000..91c09618924 --- /dev/null +++ b/egs/zeroth_korean/s5/path.sh @@ -0,0 +1,6 @@ +export KALDI_ROOT=`pwd`/../../.. 
+[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=ko_KR.UTF-8 diff --git a/egs/zeroth_korean/s5/steps b/egs/zeroth_korean/s5/steps new file mode 120000 index 00000000000..6e99bf5b5ad --- /dev/null +++ b/egs/zeroth_korean/s5/steps @@ -0,0 +1 @@ +../../wsj/s5/steps \ No newline at end of file diff --git a/egs/zeroth_korean/s5/utils b/egs/zeroth_korean/s5/utils new file mode 120000 index 00000000000..b240885218f --- /dev/null +++ b/egs/zeroth_korean/s5/utils @@ -0,0 +1 @@ +../../wsj/s5/utils \ No newline at end of file From d2856ba5614267bdb32c611aab97e3040b96a9fb Mon Sep 17 00:00:00 2001 From: Wonkyum Lee Date: Tue, 20 Mar 2018 11:30:09 -0700 Subject: [PATCH 02/26] main script --- .../chain/multi_condition/run_tdnn_1b.sh | 299 ++++++++++++++++ .../chain/multi_condition/run_tdnn_1n.sh | 302 ++++++++++++++++ .../chain/multi_condition/run_tdnn_lstm_1e.sh | 328 ++++++++++++++++++ .../s5/local/chain/run_tdnn_1a.sh | 266 ++++++++++++++ .../s5/local/chain/run_tdnn_1b.sh | 271 +++++++++++++++ .../s5/local/chain/run_tdnn_1n.sh | 275 +++++++++++++++ .../s5/local/chain/run_tdnn_lstm_1e.sh | 290 ++++++++++++++++ .../s5/local/chain/run_tdnn_opgru_1c.sh | 291 ++++++++++++++++ egs/zeroth_korean/s5/local/data_prep.sh | 104 ++++++ .../s5/local/download_and_untar.sh | 61 ++++ egs/zeroth_korean/s5/local/format_lms.sh | 63 ++++ .../s5/local/multi_condition/copy_ali_dir.sh | 78 +++++ .../s5/local/nnet2/run_5a_recData01.sh | 76 ++++ .../s5/local/nnet2/run_5a_train_2x.sh | 105 ++++++ .../s5/local/nnet2/run_5a_train_clean.sh | 77 ++++ .../multi_condition/run_ivector_common.sh | 214 ++++++++++++ .../s5/local/nnet3/run_ivector_common.sh | 124 +++++++ .../local/online/export_online_nnet2_model.sh | 33 ++ 
.../s5/local/online/run_nnet2_common.sh | 101 ++++++ .../s5/local/online/run_nnet2_ms.sh | 267 ++++++++++++++ .../s5/local/online/run_nnet2_ms_disc.sh | 164 +++++++++ egs/zeroth_korean/s5/local/prepare_dict.sh | 65 ++++ egs/zeroth_korean/s5/local/score.sh | 63 ++++ .../s5/local/updateSegmentation.sh | 51 +++ egs/zeroth_korean/s5/run.sh | 194 +++++++++++ 25 files changed, 4162 insertions(+) create mode 100755 egs/zeroth_korean/s5/local/chain/multi_condition/run_tdnn_1b.sh create mode 100755 egs/zeroth_korean/s5/local/chain/multi_condition/run_tdnn_1n.sh create mode 100755 egs/zeroth_korean/s5/local/chain/multi_condition/run_tdnn_lstm_1e.sh create mode 100755 egs/zeroth_korean/s5/local/chain/run_tdnn_1a.sh create mode 100755 egs/zeroth_korean/s5/local/chain/run_tdnn_1b.sh create mode 100755 egs/zeroth_korean/s5/local/chain/run_tdnn_1n.sh create mode 100755 egs/zeroth_korean/s5/local/chain/run_tdnn_lstm_1e.sh create mode 100755 egs/zeroth_korean/s5/local/chain/run_tdnn_opgru_1c.sh create mode 100755 egs/zeroth_korean/s5/local/data_prep.sh create mode 100755 egs/zeroth_korean/s5/local/download_and_untar.sh create mode 100755 egs/zeroth_korean/s5/local/format_lms.sh create mode 100755 egs/zeroth_korean/s5/local/multi_condition/copy_ali_dir.sh create mode 100755 egs/zeroth_korean/s5/local/nnet2/run_5a_recData01.sh create mode 100755 egs/zeroth_korean/s5/local/nnet2/run_5a_train_2x.sh create mode 100755 egs/zeroth_korean/s5/local/nnet2/run_5a_train_clean.sh create mode 100755 egs/zeroth_korean/s5/local/nnet3/multi_condition/run_ivector_common.sh create mode 100755 egs/zeroth_korean/s5/local/nnet3/run_ivector_common.sh create mode 100755 egs/zeroth_korean/s5/local/online/export_online_nnet2_model.sh create mode 100755 egs/zeroth_korean/s5/local/online/run_nnet2_common.sh create mode 100755 egs/zeroth_korean/s5/local/online/run_nnet2_ms.sh create mode 100755 egs/zeroth_korean/s5/local/online/run_nnet2_ms_disc.sh create mode 100755 egs/zeroth_korean/s5/local/prepare_dict.sh 
create mode 100755 egs/zeroth_korean/s5/local/score.sh create mode 100755 egs/zeroth_korean/s5/local/updateSegmentation.sh create mode 100755 egs/zeroth_korean/s5/run.sh diff --git a/egs/zeroth_korean/s5/local/chain/multi_condition/run_tdnn_1b.sh b/egs/zeroth_korean/s5/local/chain/multi_condition/run_tdnn_1b.sh new file mode 100755 index 00000000000..c8ebaeb2e05 --- /dev/null +++ b/egs/zeroth_korean/s5/local/chain/multi_condition/run_tdnn_1b.sh @@ -0,0 +1,299 @@ +#!/bin/bash + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +train_set=train_clean +num_data_reps=1 # number of reverberated copies of data to generate +speed_perturb=true +test_sets="test_200" +gmm=tri4b # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. +nnet3_affix=_rvb # affix for exp dirs, e.g. it was _cleaned in tedlium. + +# Options which are not passed through to run_ivector_common.sh +affix=1b_rvb #affix for TDNN directory e.g. "1a" or "1b", in case we change the configuration. +common_egs_dir= + +# LSTM/chain options +train_stage=-10 +xent_regularize=0.1 +max_param_change=2.0 + +# training chunk-options +chunk_width=150 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 + +# training options +srand=0 +num_jobs_initial=2 +num_jobs_final=12 +num_epochs=4 +minibatch_size=128 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +remove_egs=true + + +#decode options +test_online_decoding=true # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 9 ]; then + # Get the alignments as lattices (gives the chain training more freedom). 
+ # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $clean_lat_dir + rm $clean_lat_dir/fsts.*.gz # save space + # Create the lattices for the reverberated data + + # We use the lattices/alignments from the clean data for the reverberated data. + mkdir -p $lat_dir/temp/ + lattice-copy "ark:gunzip -c $clean_lat_dir/lat.*.gz |" ark,scp:$lat_dir/temp/lats.ark,$lat_dir/temp/lats.scp + + # copy the lattices for the reverberated data + rm -f $lat_dir/temp/combined_lats.scp + touch $lat_dir/temp/combined_lats.scp + # Here prefix "rev0_" represents the clean set, "rev1_" represents the reverberated set + for i in `seq 0 $num_data_reps`; do + cat $lat_dir/temp/lats.scp | sed -e "s/^/rev${i}_/" >> $lat_dir/temp/combined_lats.scp + done + sort -u $lat_dir/temp/combined_lats.scp > $lat_dir/temp/combined_lats_sorted.scp + + lattice-copy scp:$lat_dir/temp/combined_lats_sorted.scp "ark:|gzip -c >$lat_dir/lat.1.gz" || exit 1; + echo "1" > $lat_dir/num_jobs + + # copy other files from original lattice dir + for f in cmvn_opts final.mdl splice_opts tree; do + cp $clean_lat_dir/$f $lat_dir/$f + done + +fi + +if [ $stage -le 10 ]; then + # Build a tree using our new topology. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + +if [ $stage -le 11 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=512 + relu-renorm-layer name=tdnn2 dim=512 input=Append(-1,0,1) + relu-renorm-layer name=tdnn3 dim=512 input=Append(-1,0,1) + relu-renorm-layer name=tdnn4 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn5 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=512 input=Append(-6,-3,0) + + ## adding the layers for chain branch + relu-renorm-layer name=prefinal-chain dim=512 target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-renorm-layer name=prefinal-xent input=tdnn6 dim=512 target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/wsj-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.srand $srand \ + --trainer.max-param-change $max_param_change \ + --trainer.num-epochs $num_epochs \ + --trainer.frames-per-iter 1500000 \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --cleanup.remove-egs $remove_egs \ + 
--use-gpu true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/lang/check_phones_compatible.sh \ + data/lang_test_tgsmall/phones.txt $lang/phones.txt + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgsmall \ + $tree_dir $tree_dir/graph_tgsmall || exit 1; +fi + +if $test_online_decoding && [ $stage -le 14 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. + steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3_rvb/extractor ${dir} ${dir}_online + + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + data_affix=$(echo $data | sed s/test_//) + nspk=$(wc -l $lang/topo + fi +fi + +if [ $stage -le 9 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $clean_lat_dir + rm $clean_lat_dir/fsts.*.gz # save space + # Create the lattices for the reverberated data + + # We use the lattices/alignments from the clean data for the reverberated data. 
+ mkdir -p $lat_dir/temp/ + lattice-copy "ark:gunzip -c $clean_lat_dir/lat.*.gz |" ark,scp:$lat_dir/temp/lats.ark,$lat_dir/temp/lats.scp + + # copy the lattices for the reverberated data + rm -f $lat_dir/temp/combined_lats.scp + touch $lat_dir/temp/combined_lats.scp + # Here prefix "rev0_" represents the clean set, "rev1_" represents the reverberated set + for i in `seq 0 $num_data_reps`; do + cat $lat_dir/temp/lats.scp | sed -e "s/^/rev${i}_/" >> $lat_dir/temp/combined_lats.scp + done + sort -u $lat_dir/temp/combined_lats.scp > $lat_dir/temp/combined_lats_sorted.scp + + lattice-copy scp:$lat_dir/temp/combined_lats_sorted.scp "ark:|gzip -c >$lat_dir/lat.1.gz" || exit 1; + echo "1" > $lat_dir/num_jobs + + # copy other files from original lattice dir + for f in cmvn_opts final.mdl splice_opts tree; do + cp $clean_lat_dir/$f $lat_dir/$f + done + +fi + +if [ $stage -le 10 ]; then + # Build a tree using our new topology. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + +if [ $stage -le 11 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + opts="l2-regularize=0.002" + linear_opts="orthonormal-constraint=1.0" + output_opts="l2-regularize=0.0005 bottleneck-dim=256" + + mkdir -p $dir/configs + + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 $opts dim=1280 + linear-component name=tdnn2l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-layer name=tdnn2 $opts input=Append(0,1) dim=1280 + linear-component name=tdnn3l dim=256 $linear_opts + relu-batchnorm-layer name=tdnn3 $opts dim=1280 + linear-component name=tdnn4l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-layer name=tdnn4 $opts input=Append(0,1) dim=1280 + linear-component name=tdnn5l dim=256 $linear_opts + relu-batchnorm-layer name=tdnn5 $opts dim=1280 input=Append(tdnn5l, tdnn3l) + linear-component name=tdnn6l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn6 $opts input=Append(0,3) dim=1280 + linear-component name=tdnn7l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn7 $opts input=Append(0,3,tdnn6l,tdnn4l,tdnn2l) dim=1280 + 
linear-component name=tdnn8l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn8 $opts input=Append(0,3) dim=1280 + linear-component name=tdnn9l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn9 $opts input=Append(0,3,tdnn8l,tdnn6l,tdnn4l) dim=1280 + linear-component name=tdnn10l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn10 $opts input=Append(0,3) dim=1280 + linear-component name=tdnn11l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn11 $opts input=Append(0,3,tdnn10l,tdnn8l,tdnn6l) dim=1280 + linear-component name=prefinal-l dim=256 $linear_opts + + relu-batchnorm-layer name=prefinal-chain input=prefinal-l $opts dim=1280 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + relu-batchnorm-layer name=prefinal-xent input=prefinal-l $opts dim=1280 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/wsj-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.max-param-change $max_param_change \ + --trainer.num-epochs $num_epochs \ + --trainer.frames-per-iter 1500000 \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --egs.stage $get_egs_stage \ + --egs.chunk-width $chunk_width \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --cleanup.remove-egs $remove_egs \ + --use-gpu true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. 
+ + utils/lang/check_phones_compatible.sh \ + data/lang_test_tgsmall/phones.txt $lang/phones.txt + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgsmall \ + $tree_dir $tree_dir/graph_tgsmall || exit 1; +fi + +if $test_online_decoding && [ $stage -le 14 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. + steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3_rvb/extractor ${dir} ${dir}_online + + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + data_affix=$(echo $data | sed s/test_//) + nspk=$(wc -l $lang/topo + fi +fi + +if [ $stage -le 9 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $clean_lat_dir + rm $clean_lat_dir/fsts.*.gz # save space + # Create the lattices for the reverberated data + + # We use the lattices/alignments from the clean data for the reverberated data. 
+ mkdir -p $lat_dir/temp/ + lattice-copy "ark:gunzip -c $clean_lat_dir/lat.*.gz |" ark,scp:$lat_dir/temp/lats.ark,$lat_dir/temp/lats.scp + + # copy the lattices for the reverberated data + rm -f $lat_dir/temp/combined_lats.scp + touch $lat_dir/temp/combined_lats.scp + # Here prefix "rev0_" represents the clean set, "rev1_" represents the reverberated set + for i in `seq 0 $num_data_reps`; do + cat $lat_dir/temp/lats.scp | sed -e "s/^/rev${i}_/" >> $lat_dir/temp/combined_lats.scp + done + sort -u $lat_dir/temp/combined_lats.scp > $lat_dir/temp/combined_lats_sorted.scp + + lattice-copy scp:$lat_dir/temp/combined_lats_sorted.scp "ark:|gzip -c >$lat_dir/lat.1.gz" || exit 1; + echo "1" > $lat_dir/num_jobs + + # copy other files from original lattice dir + for f in cmvn_opts final.mdl splice_opts tree; do + cp $clean_lat_dir/$f $lat_dir/$f + done + +fi + +if [ $stage -le 10 ]; then + # Build a tree using our new topology. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=20" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=1024 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + fast-lstmp-layer name=fastlstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 + fast-lstmp-layer name=fastlstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=1024 + fast-lstmp-layer name=fastlstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + + ## adding the layers for 
chain branch + output-layer name=output input=fastlstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=fastlstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + +fi + + +if [ $stage -le 12 ]; then + + hostInAtlas="ares hephaestus jupiter neptune" + if [[ ! -z $(echo $hostInAtlas | grep -o $(hostname -f)) ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl /mnt/{ares,hephaestus,jupiter,neptune}/$USER/kaldi-data/zeroth-kaldi-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage \ + $dir/egs/storage + fi + #if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + # utils/create_split_dir.pl \ + # /export/b0{3,4,5,6}/$USER/kaldi-data/egs/wsj-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + #fi + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change $max_param_change \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_chunk \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). 
So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/lang/check_phones_compatible.sh \ + data/lang_test_tgsmall/phones.txt $lang/phones.txt + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgsmall \ + $tree_dir $tree_dir/graph_tgsmall || exit 1; +fi + +if $test_online_decoding && [ $stage -le 14 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. + steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3_rvb/extractor ${dir} ${dir}_online + + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + data_affix=$(echo $data | sed s/test_//) + nspk=$(wc -l $lang/topo + fi +fi + +if [ $stage -le 9 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 10 ]; then + # Build a tree using our new topology. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + +if [ $stage -le 11 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=512 + relu-renorm-layer name=tdnn2 dim=512 input=Append(-1,0,1) + relu-renorm-layer name=tdnn3 dim=512 input=Append(-1,0,1) + relu-renorm-layer name=tdnn4 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn5 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=512 input=Append(-6,-3,0) + + ## adding the layers for chain branch + relu-renorm-layer name=prefinal-chain dim=512 target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-renorm-layer name=prefinal-xent input=tdnn6 dim=512 target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/wsj-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.srand $srand \ + --trainer.max-param-change $max_param_change \ + --trainer.num-epochs $num_epochs \ + --trainer.frames-per-iter 1500000 \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --cleanup.remove-egs $remove_egs \ + 
--use-gpu true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/lang/check_phones_compatible.sh \ + data/lang_test_tgsmall/phones.txt $lang/phones.txt + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgsmall \ + $tree_dir $tree_dir/graph_tgsmall || exit 1; +fi + +if $test_online_decoding && [ $stage -le 14 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. + steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3/extractor ${dir} ${dir}_online + + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + data_affix=$(echo $data | sed s/test_//) + nspk=$(wc -l $lang/topo + fi +fi + +if [ $stage -le 9 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 10 ]; then + # Build a tree using our new topology. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + +if [ $stage -le 11 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=512 + relu-renorm-layer name=tdnn2 dim=512 input=Append(-1,0,1) + relu-renorm-layer name=tdnn3 dim=512 input=Append(-1,0,1) + relu-renorm-layer name=tdnn4 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn5 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=512 input=Append(-6,-3,0) + + ## adding the layers for chain branch + relu-renorm-layer name=prefinal-chain dim=512 target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-renorm-layer name=prefinal-xent input=tdnn6 dim=512 target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/wsj-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.srand $srand \ + --trainer.max-param-change $max_param_change \ + --trainer.num-epochs $num_epochs \ + --trainer.frames-per-iter 1500000 \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --cleanup.remove-egs $remove_egs \ + 
--use-gpu true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/lang/check_phones_compatible.sh \ + data/lang_test_tgsmall/phones.txt $lang/phones.txt + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgsmall \ + $tree_dir $tree_dir/graph_tgsmall || exit 1; +fi + +if $test_online_decoding && [ $stage -le 14 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. + steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3/extractor ${dir} ${dir}_online + + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + data_affix=$(echo $data | sed s/test_//) + nspk=$(wc -l $lang/topo + fi +fi + +if [ $stage -le 9 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 10 ]; then + # Build a tree using our new topology. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + +if [ $stage -le 11 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + opts="l2-regularize=0.002" + linear_opts="orthonormal-constraint=1.0" + output_opts="l2-regularize=0.0005 bottleneck-dim=256" + + mkdir -p $dir/configs + + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 $opts dim=1280 + linear-component name=tdnn2l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-layer name=tdnn2 $opts input=Append(0,1) dim=1280 + linear-component name=tdnn3l dim=256 $linear_opts + relu-batchnorm-layer name=tdnn3 $opts dim=1280 + linear-component name=tdnn4l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-layer name=tdnn4 $opts input=Append(0,1) dim=1280 + linear-component name=tdnn5l dim=256 $linear_opts + relu-batchnorm-layer name=tdnn5 $opts dim=1280 input=Append(tdnn5l, tdnn3l) + linear-component name=tdnn6l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn6 $opts input=Append(0,3) dim=1280 + linear-component name=tdnn7l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn7 $opts input=Append(0,3,tdnn6l,tdnn4l,tdnn2l) dim=1280 + 
linear-component name=tdnn8l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn8 $opts input=Append(0,3) dim=1280 + linear-component name=tdnn9l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn9 $opts input=Append(0,3,tdnn8l,tdnn6l,tdnn4l) dim=1280 + linear-component name=tdnn10l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn10 $opts input=Append(0,3) dim=1280 + linear-component name=tdnn11l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn11 $opts input=Append(0,3,tdnn10l,tdnn8l,tdnn6l) dim=1280 + linear-component name=prefinal-l dim=256 $linear_opts + + relu-batchnorm-layer name=prefinal-chain input=prefinal-l $opts dim=1280 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + relu-batchnorm-layer name=prefinal-xent input=prefinal-l $opts dim=1280 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/wsj-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.max-param-change $max_param_change \ + --trainer.num-epochs $num_epochs \ + --trainer.frames-per-iter 1500000 \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --egs.stage $get_egs_stage \ + --egs.chunk-width $chunk_width \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --cleanup.remove-egs $remove_egs \ + --use-gpu true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. 
+ + utils/lang/check_phones_compatible.sh \ + data/lang_test_tgsmall/phones.txt $lang/phones.txt + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgsmall \ + $tree_dir $tree_dir/graph_tgsmall || exit 1; +fi + +if $test_online_decoding && [ $stage -le 14 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. + steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3/extractor ${dir} ${dir}_online + + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + data_affix=$(echo $data | sed s/test_//) + nspk=$(wc -l $lang/topo + fi +fi + +if [ $stage -le 9 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 10 ]; then + # Build a tree using our new topology. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=20" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=1024 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + fast-lstmp-layer name=fastlstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 + fast-lstmp-layer name=fastlstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=1024 + fast-lstmp-layer name=fastlstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + + ## adding the layers for 
chain branch + output-layer name=output input=fastlstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=fastlstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + +fi + + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/wsj-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change $max_param_change \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_chunk \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). 
So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/lang/check_phones_compatible.sh \ + data/lang_test_tgsmall/phones.txt $lang/phones.txt + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgsmall \ + $tree_dir $tree_dir/graph_tgsmall || exit 1; +fi + +if $test_online_decoding && [ $stage -le 14 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. + steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3/extractor ${dir} ${dir}_online + + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + data_affix=$(echo $data | sed s/test_//) + nspk=$(wc -l $lang/topo + fi +fi + +if [ $stage -le 9 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 10 ]; then + # Build a tree using our new topology. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + gru_opts="dropout-per-frame=true dropout-proportion=0.0" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=1024 + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 + + # check steps/libs/nnet3/xconfig/gru.py for the other options and defaults + norm-opgru-layer name=opgru1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $gru_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024 + norm-opgru-layer name=opgru2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $gru_opts + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=1024 + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=1024 + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=1024 + norm-opgru-layer name=opgru3 cell-dim=1024 
recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $gru_opts + + ## adding the layers for chain branch + output-layer name=output input=opgru3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=opgru3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + +fi + + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/wsj-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.optimization.backstitch-training-scale 0.3 \ + --trainer.optimization.backstitch-training-interval 1 \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --trainer.deriv-truncate-margin 8 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir + +fi + +if [ $stage -le 13 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). 
So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/lang/check_phones_compatible.sh \ + data/lang_test_tgsmall/phones.txt $lang/phones.txt + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgsmall \ + $tree_dir $tree_dir/graph_tgsmall || exit 1; +fi + +if $test_online_decoding && [ $stage -le 14 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. + steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3/extractor ${dir} ${dir}_online + + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + data_affix=$(echo $data | sed s/test_//) + nspk=$(wc -l " + echo "e.g.: $0 /export/a15/vpanayotov/data/LibriSpeech/dev-clean data/dev-clean" + exit 1 +fi + +src=$1 +dst=$2 + +# all utterances are FLAC compressed +if ! which flac >&/dev/null; then + echo "Please install 'flac' on ALL worker nodes!" + exit 1 +fi + +spk_file=$src/../AUDIO_INFO + +mkdir -p $dst || exit 1; + +[ ! -d $src ] && echo "$0: no such directory $src" && exit 1; +[ ! -f $spk_file ] && echo "$0: expected file $spk_file to exist" && exit 1; + +wav_scp=$dst/wav.scp; [[ -f "$wav_scp" ]] && rm $wav_scp +trans=$dst/text; [[ -f "$trans" ]] && rm $trans +utt2spk=$dst/utt2spk; [[ -f "$utt2spk" ]] && rm $utt2spk +spk2gender=$dst/spk2gender; [[ -f $spk2gender ]] && rm $spk2gender +utt2dur=$dst/utt2dur; [[ -f "$utt2dur" ]] && rm $utt2dur + +for scriptid_dir in $(find -L $src -mindepth 1 -maxdepth 1 -type d | sort); do + scriptid=$(basename $scriptid_dir) + if ! [ $scriptid -eq $scriptid ]; then # not integer. + echo "$0: unexpected subdirectory name $scriptid" + exit 1; + fi + + for reader_dir in $(find -L $scriptid_dir/ -mindepth 1 -maxdepth 1 -type d | sort); do + reader=$(basename $reader_dir) + if ! 
[ "$reader" -eq "$reader" ]; then + echo "$0: unexpected reader-subdirectory name $reader" + exit 1; + fi + + reader_gender=$(egrep "^$reader\|" $spk_file | awk -F'|' '{gsub(/[ ]+/, ""); print tolower($3)}') + if [ "$reader_gender" != 'm' ] && [ "$reader_gender" != 'f' ]; then + echo "Unexpected gender: '$reader_gender'" + exit 1; + fi + + echo " "$scriptid $reader $reader_gender + + find -L $reader_dir/ -iname "*.flac" | sort | xargs -I% basename % .flac | \ + awk -v "dir=$reader_dir" '{printf "%s flac -c -d -s %s/%s.flac |\n", $0, dir, $0}' >>$wav_scp|| exit 1 + + reader_trans=$reader_dir/${reader}_${scriptid}.trans.txt + [ ! -f $reader_trans ] && echo "$0: expected file $reader_trans to exist" && exit 1 + cat $reader_trans >>$trans + + # NOTE: For now we are using per-chapter utt2spk. That is each chapter is considered + # to be a different speaker. This is done for simplicity and because we want + # e.g. the CMVN to be calculated per-chapter + awk -v "reader=$reader" -v "scriptid=$scriptid" '{printf "%s %s_%s\n", $1, reader, scriptid}' \ + <$reader_trans >>$utt2spk || exit 1 + + # reader -> gender map (again using per-chapter granularity) + echo "${reader}_${scriptid} $reader_gender" >>$spk2gender + + done +done + +# sort +cat $wav_scp | sort > tmp +cp tmp $wav_scp +cat $trans | sort > tmp +cp tmp $trans +cat $utt2spk | sort > tmp +cp tmp $utt2spk +cat $spk2gender | sort > tmp +cp tmp $spk2gender +rm tmp + + +spk2utt=$dst/spk2utt +utils/utt2spk_to_spk2utt.pl <$utt2spk >$spk2utt || exit 1 + +ntrans=$(wc -l <$trans) +nutt2spk=$(wc -l <$utt2spk) +! 
[ "$ntrans" -eq "$nutt2spk" ] && \ + echo "Inconsistent #transcripts($ntrans) and #utt2spk($nutt2spk)" && exit 1; + +utils/data/get_utt2dur.sh $dst 1>&2 || exit 1 + +utils/validate_data_dir.sh --no-feats $dst || exit 1; + +echo "$0: successfully prepared data in $dst" + +exit 0 diff --git a/egs/zeroth_korean/s5/local/download_and_untar.sh b/egs/zeroth_korean/s5/local/download_and_untar.sh new file mode 100755 index 00000000000..0b56bcb37b3 --- /dev/null +++ b/egs/zeroth_korean/s5/local/download_and_untar.sh @@ -0,0 +1,61 @@ +#!/bin/bash + +# Copyright 2018 Lucas Jo (Atlas Guide) +# 2018 Wonkyum Lee (Gridspace) +# Apache 2.0 + +if [ $# -ne "1" ]; then + echo "Usage: $0 " + echo "e.g.: $0 ./db" + exit 1 +fi + +exists(){ + command -v "$1" >/dev/null 2>&1 +} + + +dir=$1 +local_lm_dir=data/local/lm + +AUDIOINFO='AUDIO_INFO' +AUDIOLIST='train_data_01 test_data_01' + +echo "Now download corpus ----------------------------------------------------" +if [ ! -f $dir/db.tar.gz ]; then + if [ ! -d $dir ]; then + mkdir -p $dir + fi + wget -O $dir/db.tar.gz https://storage.googleapis.com/zeroth_project/zeroth_korean.tar.gz +else + echo " $dir/db.tar.gz already exist" +fi + +echo "Now extract corpus ----------------------------------------------------" +if [ ! -f $dir/$AUDIOINFO ]; then + tar -zxvf $dir/db.tar.gz -C $dir + else + echo " corpus already extracted" +fi + +if [ ! 
-d $local_lm_dir ]; then + mkdir -p $local_lm_dir +fi +echo "Check LMs files" +LMList="\ + zeroth.lm.fg.arpa.gz \ + zeroth.lm.tg.arpa.gz \ + zeroth.lm.tgmed.arpa.gz \ + zeroth.lm.tgsmall.arpa.gz \ + zeroth_lexicon \ + zeroth_morfessor.seg" + +for file in $LMList; do + if [ -f $local_lm_dir/$file ]; then + echo $file already exist + else + echo "Linking "$file + ln -s $PWD/$dir/$file $local_lm_dir/$file + fi +done +echo "all the files (lexicon, LM, segment model) are ready" diff --git a/egs/zeroth_korean/s5/local/format_lms.sh b/egs/zeroth_korean/s5/local/format_lms.sh new file mode 100755 index 00000000000..5947ae6b620 --- /dev/null +++ b/egs/zeroth_korean/s5/local/format_lms.sh @@ -0,0 +1,63 @@ +#!/bin/bash + +# Copyright 2014 Vassil Panayotov +# Apache 2.0 + +# Prepares the test time language model(G) transducers +# (adapted from wsj/s5/local/wsj_format_data.sh) + +# Modified by Lucas Jo 2017 (Altas Guide) + +. ./path.sh || exit 1; + +# begin configuration section +src_dir=data/lang +# end configuration section + +. utils/parse_options.sh || exit 1; + +set -e + +if [ $# -ne 1 ]; then + echo "Usage: $0 " + echo "e.g.: $0 /export/a15/vpanayotov/data/lm" + echo ", where:" + echo " is the directory in which the language model is stored/downloaded" + echo "Options:" + echo " --src-dir # source lang directory, default data/lang" + exit 1 +fi + +lm_dir=$1 + +if [ ! -d $lm_dir ]; then + echo "$0: expected source LM directory $lm_dir to exist" + exit 1; +fi +if [ ! -f $src_dir/words.txt ]; then + echo "$0: expected $src_dir/words.txt to exist." + exit 1; +fi + + +tmpdir=data/local/lm_tmp.$$ +trap "rm -r $tmpdir" EXIT + +mkdir -p $tmpdir + +for lm_suffix in tgsmall tgmed; do + # tglarge is prepared by a separate command, called from run.sh; we don't + # want to compile G.fst for tglarge, as it takes a while. 
+ test=${src_dir}_test_${lm_suffix} + mkdir -p $test + cp -r ${src_dir}/* $test + gunzip -c $lm_dir/zeroth.lm.${lm_suffix}.arpa.gz | \ + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=$test/words.txt - $test/G.fst + + utils/validate_lang.pl --skip-determinization-check $test || exit 1; +done + +echo "Succeeded in formatting data." + +exit 0 diff --git a/egs/zeroth_korean/s5/local/multi_condition/copy_ali_dir.sh b/egs/zeroth_korean/s5/local/multi_condition/copy_ali_dir.sh new file mode 100755 index 00000000000..42ea2dc4b9d --- /dev/null +++ b/egs/zeroth_korean/s5/local/multi_condition/copy_ali_dir.sh @@ -0,0 +1,78 @@ +#!/bin/bash + +# Copyright 2014 Johns Hopkins University (author: Vijayaditya Peddinti) +# Apache 2.0 + +# This script operates on a directory, such as in exp/tri4a_ali, +# that contains some subset of the following files: +# ali.*.gz +# tree +# cmvn_opts +# splice_opts +# num_jobs +# final.mdl +# It copies to another directory, possibly adding a specified prefix or a suffix +# to the utterance names. + + +# begin configuration section +utt_prefix= +utt_suffix= +cmd=run.pl +# end configuration section + +. utils/parse_options.sh + +if [ $# != 2 ]; then + echo "Usage: " + echo " $0 [options] " + echo "e.g.:" + echo " $0 --utt-prefix=1- exp/tri4a_ali exp/tri4a_rev1_ali" + echo "Options" + echo " --utt-prefix= # Prefix for utterance ids, default empty" + echo " --utt-suffix= # Suffix for utterance ids, default empty" + exit 1; +fi + + +export LC_ALL=C + +src_dir=$1 +dest_dir=$2 + +mkdir -p $dest_dir + +if [ ! -f $src_dir/ali.1.gz ]; then + echo "copy_ali_dir.sh: no such files $src_dir/ali.*.gz" + exit 1; +fi + +for f in tree cmvn_opts splice_opts num_jobs final.mdl; do + if [ ! -f $src_dir/$f ]; then + echo "copy_ali_dir.sh: no such file $src_dir/$f this might be serious error." 
+ continue + fi + cp $src_dir/$f $dest_dir/ +done + +nj=$(cat $dest_dir/num_jobs) +mkdir -p $dest_dir/temp +cat << EOF > $dest_dir/temp/copy_ali.sh +set -e; +id=\$1 +echo "$src_dir/ali.\$id.gz" +gunzip -c $src_dir/ali.\$id.gz | \ + copy-int-vector ark:- ark,t:- | \ +python -c " +import sys +for line in sys.stdin: + parts = line.split() + print '$utt_prefix{0}$utt_suffix {1}'.format(parts[0], ' '.join(parts[1:])) +" | \ + gzip -c >$dest_dir/ali.\$id.gz || exit 1; +set +o pipefail; # unset the pipefail option. +EOF +chmod +x $dest_dir/temp/copy_ali.sh +$cmd -v PATH JOB=1:$nj $dest_dir/temp/copy_ali.JOB.log $dest_dir/temp/copy_ali.sh JOB || exit 1; + +echo "$0: copied alignments from $src_dir to $dest_dir" diff --git a/egs/zeroth_korean/s5/local/nnet2/run_5a_recData01.sh b/egs/zeroth_korean/s5/local/nnet2/run_5a_recData01.sh new file mode 100755 index 00000000000..c7e563906c6 --- /dev/null +++ b/egs/zeroth_korean/s5/local/nnet2/run_5a_recData01.sh @@ -0,0 +1,76 @@ +#!/bin/bash + +# This is p-norm neural net training, with the "fast" script, on top of adapted +# 40-dimensional features. + +# Modified by Lucas Jo 2017 (Altas Guide) + + +train_stage=-10 +use_gpu=true + +. cmd.sh +. ./path.sh +. utils/parse_options.sh + + +if $use_gpu; then + if ! 
cuda-compiled; then + cat < $from/reco2dur + + if [ -f $to/utt2dur ] ; then + rm $to/uttdur + fi + for i in `seq 0 ${num_data_reps}`; do + cat $from/reco2dur | sed -e "s/^/rev${i}_/" >> $to/utt2dur + done + ### + + + for datadir in ${trainset}_rvb${num_data_reps} ; do + steps/make_mfcc.sh --nj $nj --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/${datadir}_hires exp/make_hires/$datadir $mfccdir || exit 1; + steps/compute_cmvn_stats.sh data/${datadir}_hires exp/make_hires/$datadir $mfccdir || exit 1; + done + + # copy the alignments for the newly created utterance ids + ali_dirs= + for i in `seq 0 $num_data_reps`; do + local/multi_condition/copy_ali_dir.sh --cmd "$decode_cmd" --utt-prefix "rev${i}_" ${gmmdir}_ali_${trainset} ${gmmdir}_ali_${trainset}_temp_$i || exit 1; + ali_dirs+=" ${gmmdir}_ali_${trainset}_temp_$i" + done + steps/combine_ali_dirs.sh data/${trainset}_rvb${num_data_reps} ${gmmdir}_ali_${trainset}_rvb $ali_dirs || exit 1; + + # We need to build a small system just because we need the LDA+MLLT transform + # to train the diag-UBM on top of. We align a subset of training data for + # this purpose. + utils/subset_data_dir.sh data/${trainset}_rvb${num_data_reps}_hires 100000 data/train_100k_hires + utils/subset_data_dir.sh data/${trainset}_rvb${num_data_reps}_hires 30000 data/train_30k_hires +fi + + +if [ $stage -le 4 ]; then + # Train a small system just for its LDA+MLLT transform. We use --num-iters 13 + # because after we get the transform (12th iter is the last), any further + # training is pointless. 
+ + mkdir exp -p exp/nnet3${rvb_affix} + + steps/train_lda_mllt.sh --cmd "$train_cmd" --num-iters 13 \ + --realign-iters "" \ + --splice-opts "--left-context=3 --right-context=3" \ + 3000 10000 data/train_100k_hires data/lang_nosp \ + ${gmmdir}_ali_${trainset}_rvb exp/nnet3${rvb_affix}/tri2b +fi + + +if [ $stage -le 5 ]; then + # To train a diagonal UBM we don't need very much data, so use a small subset + # (actually, it's not that small: still around 100 hours). + steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj $nj --num_threads $maxThread --num-frames 700000 \ + data/train_30k_hires 512 exp/nnet3${rvb_affix}/tri2b exp/nnet3${rvb_affix}/diag_ubm +fi + +if [ $stage -le 6 ]; then + # iVector extractors can in general be sensitive to the amount of data, but + # this one has a fairly small dim (defaults to 100) so we don't use all of it, + # we use just the 3k subset (about one fifth of the data, or 200 hours). + steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \ + data/${trainset}_rvb${num_data_reps}_hires exp/nnet3${rvb_affix}/diag_ubm exp/nnet3${rvb_affix}/extractor || exit 1; +fi + +if [ $stage -le 7 ]; then + ivectordir=exp/nnet3${rvb_affix}/ivectors_${trainset}_rvb${num_data_reps}_hires + + # We extract iVectors on all the train data, which will be what we train the + # system on. With --utts-per-spk-max 2, the script. pairs the utterances + # into twos, and treats each of these pairs as one speaker. Note that these + # are extracted 'online'. + + # having a larger number of speakers is helpful for generalization, and to + # handle per-utterance decoding well (iVector starts at zero). 
+ utils/data/modify_speaker_info.sh --utts-per-spk-max 2 \ + data/${trainset}_rvb${num_data_reps}_hires data/${trainset}_rvb${num_data_reps}_hires_max2 + + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj $nj \ + data/${trainset}_rvb${num_data_reps}_hires_max2 exp/nnet3${rvb_affix}/extractor $ivectordir || exit 1; +fi + + +exit 0; diff --git a/egs/zeroth_korean/s5/local/nnet3/run_ivector_common.sh b/egs/zeroth_korean/s5/local/nnet3/run_ivector_common.sh new file mode 100755 index 00000000000..116070ab50b --- /dev/null +++ b/egs/zeroth_korean/s5/local/nnet3/run_ivector_common.sh @@ -0,0 +1,124 @@ +#!/bin/bash + +# this script contains some common (shared) parts of the run_nnet*.sh scripts. +. cmd.sh + + +stage=0 +gmmdir=exp/tri4b +speed_perturb=false +trainset=train_clean + +set -e +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if [ "$speed_perturb" == "true" ]; then + if [ $stage -le 1 ]; then + #Although the nnet will be trained by high resolution data, we still have to perturbe the normal data to get the alignment + # _sp stands for speed-perturbed + + for datadir in ${trainset} ; do + utils/perturb_data_dir_speed.sh 0.9 data/${datadir} data/temp1 + utils/perturb_data_dir_speed.sh 1.1 data/${datadir} data/temp2 + utils/combine_data.sh data/${datadir}_tmp data/temp1 data/temp2 + utils/validate_data_dir.sh --no-feats data/${datadir}_tmp + rm -r data/temp1 data/temp2 + + mfccdir=mfcc_perturbed + steps/make_mfcc.sh --cmd "$train_cmd" --nj 40 \ + data/${datadir}_tmp exp/make_mfcc/${datadir}_tmp $mfccdir || exit 1; + steps/compute_cmvn_stats.sh data/${datadir}_tmp exp/make_mfcc/${datadir}_tmp $mfccdir || exit 1; + utils/fix_data_dir.sh data/${datadir}_tmp + + utils/copy_data_dir.sh --spk-prefix sp1.0- --utt-prefix sp1.0- data/${datadir} data/temp0 + utils/combine_data.sh data/${datadir}_sp data/${datadir}_tmp data/temp0 + utils/fix_data_dir.sh data/${datadir}_sp + rm -r data/temp0 data/${datadir}_tmp + done + fi + + if [ $stage -le 2 ]; 
then + #obtain the alignment of the perturbed data + steps/align_fmllr.sh --nj 100 --cmd "$train_cmd" \ + data/${trainset}_sp data/lang_nosp ${gmmdir} ${gmmdir}_ali_${trainset}_sp || exit 1 + fi + trainset=${trainset}_sp +fi + +if [ $stage -le 3 ]; then + # Create high-resolution MFCC features (with 40 cepstra instead of 13). + # this shows how you can split across multiple file-systems. we'll split the + # MFCC dir across multiple locations. You might want to be careful here, if you + # have multiple copies of Kaldi checked out and run the same recipe, not to let + # them overwrite each other. + mfccdir=mfcc_hires + #if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then + # utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/librispeech-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage + #fi + + for datadir in ${trainset} ; do + utils/copy_data_dir.sh data/$datadir data/${datadir}_hires + steps/make_mfcc.sh --nj 40 --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/${datadir}_hires exp/make_hires/$datadir $mfccdir || exit 1; + steps/compute_cmvn_stats.sh data/${datadir}_hires exp/make_hires/$datadir $mfccdir || exit 1; + done + + # We need to build a small system just because we need the LDA+MLLT transform + # to train the diag-UBM on top of. We align a subset of training data for + # this purpose. + utils/subset_data_dir.sh data/${trainset}_hires 30000 data/train_30k_hires +fi + + +if [ $stage -le 4 ]; then + # Train a small system just for its LDA+MLLT transform. We use --num-iters 13 + # because after we get the transform (12th iter is the last), any further + # training is pointless. 
+ + mkdir exp -p exp/nnet3 + + steps/train_lda_mllt.sh --cmd "$train_cmd" --num-iters 13 \ + --realign-iters "" \ + --splice-opts "--left-context=3 --right-context=3" \ + 3000 10000 data/${trainset}_hires data/lang_nosp \ + ${gmmdir}_ali_${trainset} exp/nnet3/tri2b +fi + + +if [ $stage -le 5 ]; then + # To train a diagonal UBM we don't need very much data, so use a small subset + # (actually, it's not that small: still around 100 hours). + steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 --num-frames 700000 \ + data/train_30k_hires 512 exp/nnet3/tri2b exp/nnet3/diag_ubm +fi + +if [ $stage -le 6 ]; then + # iVector extractors can in general be sensitive to the amount of data, but + # this one has a fairly small dim (defaults to 100) so we don't use all of it, + # we use just the 3k subset (about one fifth of the data, or 200 hours). + steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \ + data/${trainset}_hires exp/nnet3/diag_ubm exp/nnet3/extractor || exit 1; +fi + +if [ $stage -le 7 ]; then + ivectordir=exp/nnet3/ivectors_${trainset}_hires + + # We extract iVectors on all the train data, which will be what we train the + # system on. With --utts-per-spk-max 2, the script. pairs the utterances + # into twos, and treats each of these pairs as one speaker. Note that these + # are extracted 'online'. + + # having a larger number of speakers is helpful for generalization, and to + # handle per-utterance decoding well (iVector starts at zero). 
+ utils/data/modify_speaker_info.sh --utts-per-spk-max 2 \ + data/${trainset}_hires data/${trainset}_hires_max2 + + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 60 \ + data/${trainset}_hires_max2 exp/nnet3/extractor $ivectordir || exit 1; +fi + + +exit 0; diff --git a/egs/zeroth_korean/s5/local/online/export_online_nnet2_model.sh b/egs/zeroth_korean/s5/local/online/export_online_nnet2_model.sh new file mode 100755 index 00000000000..a9b4a61c6d2 --- /dev/null +++ b/egs/zeroth_korean/s5/local/online/export_online_nnet2_model.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +# Copyright 2017 Lucas Jo (Atlas Guide) +# Apache 2.0 + +if [ $# -ne "1" ]; then + echo "Usage: $0 " + echo "e.g.: $0 ./export" + exit 1 +fi + +tardir=$1 +srcdir=exp/nnet2_online/nnet_ms_a_online +graphdir=exp/tri5b/graph_tgsmall +oldlang=data/lang_test_tgsmall +newlang=data/lang_test_fglarge +oldlm=$oldlang/G.fst +newlm=$newlang/G.carpa +symtab=$newlang/words.txt + +for f in $srcdir/final.mdl $symtab $graphdir/HCLG.fst $srcdir/conf/mfcc.conf \ + $srcdir/conf/ivector_extractor.conf $oldlm $newlm; do + [ ! -f $f ] && echo "export_model.sh: no such file $f" && exit 1; +done + +mkdir -p $tardir/conf +cp -rpf $srcdir/final.mdl $tardir/final.mdl # acoustic model +cp -rpf $symtab $tardir/words.txt # word symbol table +cp -rpf $graphdir/HCLG.fst $tardir/HCLG.fst # HCLG +cp -rpf $srcdir/conf/mfcc.conf $tardir/conf/mfcc.conf +cp -rpf $srcdir/conf/ivector_extractor.conf $tardir/conf/ivector_extractor.conf +cp -rpf $oldlm $tardir/G.fst +cp -rpf $newlm $tardir/G.carpa diff --git a/egs/zeroth_korean/s5/local/online/run_nnet2_common.sh b/egs/zeroth_korean/s5/local/online/run_nnet2_common.sh new file mode 100755 index 00000000000..d1ac0a2f5d2 --- /dev/null +++ b/egs/zeroth_korean/s5/local/online/run_nnet2_common.sh @@ -0,0 +1,101 @@ +#!/bin/bash + +# this script contains some common (shared) parts of the run_nnet*.sh scripts. +# Modified by Lucas Jo 2017 (Altas Guide) +. 
cmd.sh + + +stage=0 + +set -e +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if [ $stage -le 1 ]; then + # Create high-resolution MFCC features (with 40 cepstra instead of 13). + # this shows how you can split across multiple file-systems. we'll split the + # MFCC dir across multiple locations. You might want to be careful here, if you + # have multiple copies of Kaldi checked out and run the same recipe, not to let + # them overwrite each other. + mfccdir=mfcc + #if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then + # utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/librispeech-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage + #fi + + for datadir in train_2x; do + utils/copy_data_dir.sh data/$datadir data/${datadir}_hires + steps/make_mfcc.sh --nj 40 --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/${datadir}_hires exp/make_hires/$datadir $mfccdir || exit 1; + steps/compute_cmvn_stats.sh data/${datadir}_hires exp/make_hires/$datadir $mfccdir || exit 1; + done + + # now create some data subsets. + # mixed is the clean+other data. + # a is 1/5 of the data, b is 2/5th of it. + utils/subset_data_dir.sh data/train_2x_hires 3000 data/train_mixed_hires_a + utils/subset_data_dir.sh data/train_2x_hires 6000 data/train_mixed_hires_b +fi + +if [ $stage -le 2 ]; then + # We need to build a small system just because we need the LDA+MLLT transform + # to train the diag-UBM on top of. We align a subset of training data for + # this purpose. + utils/subset_data_dir.sh --utt-list <(awk '{print $1}' data/train_mixed_hires_a/utt2spk) \ + data/train_2x data/train_2x_a + + steps/align_fmllr.sh --nj 40 --cmd "$train_cmd" \ + data/train_2x_a data/lang exp/tri5b exp/nnet2_online/tri5b_ali_a +fi + +if [ $stage -le 3 ]; then + # Train a small system just for its LDA+MLLT transform. We use --num-iters 13 + # because after we get the transform (12th iter is the last), any further + # training is pointless. 
+ #5000 10000 data/train_mixed_hires_a data/lang \ + steps/train_lda_mllt.sh --cmd "$train_cmd" --num-iters 13 \ + --realign-iters "" \ + --splice-opts "--left-context=3 --right-context=3" \ + 3000 20000 data/train_mixed_hires_a data/lang \ + exp/nnet2_online/tri5b_ali_a exp/nnet2_online/tri6b +fi + + +if [ $stage -le 4 ]; then + mkdir -p exp/nnet2_online + # To train a diagonal UBM we don't need very much data, so use a small subset + # (actually, it's not that small: still around 100 hours). + steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 --num-frames 400000 \ + data/train_mixed_hires_a 256 exp/nnet2_online/tri6b exp/nnet2_online/diag_ubm +fi + +if [ $stage -le 5 ]; then + # iVector extractors can in general be sensitive to the amount of data, but + # this one has a fairly small dim (defaults to 100) so we don't use all of it, + # we use just the 3k subset (about one fifth of the data, or 200 hours). + steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \ + data/train_mixed_hires_b exp/nnet2_online/diag_ubm exp/nnet2_online/extractor || exit 1; +fi + +if [ $stage -le 6 ]; then + ivectordir=exp/nnet2_online/ivectors_train_2x_hires + #if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $ivectordir/storage ]; then + # utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/librispeech-$(date +'%m_%d_%H_%M')/s5/$ivectordir/storage $ivectordir/storage + #fi + + # We extract iVectors on all the train data, which will be what we train the + # system on. With --utts-per-spk-max 2, the script. pairs the utterances + # into twos, and treats each of these pairs as one speaker. Note that these + # are extracted 'online'. + + # having a larger number of speakers is helpful for generalization, and to + # handle per-utterance decoding well (iVector starts at zero). 
+ steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/train_2x_hires data/train_2x_hires_max2 + + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 60 \ + data/train_2x_hires_max2 exp/nnet2_online/extractor $ivectordir || exit 1; +fi + + +exit 0; diff --git a/egs/zeroth_korean/s5/local/online/run_nnet2_ms.sh b/egs/zeroth_korean/s5/local/online/run_nnet2_ms.sh new file mode 100755 index 00000000000..d46e2f63667 --- /dev/null +++ b/egs/zeroth_korean/s5/local/online/run_nnet2_ms.sh @@ -0,0 +1,267 @@ +#!/bin/bash + +# This is the "multi-splice" version of the online-nnet2 training script. +# It's currently the best recipe. +# You'll notice that we splice over successively larger windows as we go deeper +# into the network. + +# Modified by Lucas Jo 2017 (Altas Guide) + +. cmd.sh + + +stage=0 +train_stage=-10 +use_gpu=true +dir=exp/nnet2_online/nnet_ms_a +exit_train_stage=-100 + +set -e +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if $use_gpu; then + if ! cuda-compiled; then + cat < " + echo "e.g.: /data/local/lm data/local/dict_nosp" + exit 1 +fi +lm_dir=$1 +dst_dir=$2 + +mkdir -p $dst_dir || exit 1; + +# this file is a copy of the lexicon we obtained from download_lm.sh process +lexicon_raw_nosil=$dst_dir/lexicon_raw_nosil.txt + +if [[ ! -s "$lexicon_raw_nosil" ]]; then + cp $lm_dir/zeroth_lexicon $lexicon_raw_nosil || exit 1 +fi + +silence_phones=$dst_dir/silence_phones.txt +optional_silence=$dst_dir/optional_silence.txt +nonsil_phones=$dst_dir/nonsilence_phones.txt +extra_questions=$dst_dir/extra_questions.txt + +echo "Preparing phone lists and clustering questions" +(echo SIL; echo SPN;) > $silence_phones +#( echo SIL; echo BRH; echo CGH; echo NSN ; echo SMK; echo UM; echo UHH ) > $silence_phones +echo SIL > $optional_silence +# nonsilence phones; on each line is a list of phones that correspond +# really to the same base phone. 
+awk '{for (i=2; i<=NF; ++i) { print $i; gsub(/[0-9]/, "", $i); print $i}}' $lexicon_raw_nosil |\ + sort -u |\ + perl -e 'while(<>){ +chop; m:^([^\d]+)(\d*)$: || die "Bad phone $_"; +$phones_of{$1} .= "$_ "; } +foreach $list (values %phones_of) {print $list . "\n"; } ' \ + > $nonsil_phones || exit 1; +# A few extra questions that will be added to those obtained by +# automatically clustering +# the "real" phones. These ask about stress; there's also one for +# silence. +cat $silence_phones| awk '{printf("%s ", $1);} END{printf "\n";}' > $extra_questions || exit 1; +cat $nonsil_phones | perl -e 'while(<>){ foreach $p (split(" ", $_)){ + $p =~ m:^([^\d]+)(\d*)$: || die "Bad phone $_"; $q{$2} .= "$p "; } } foreach $l (values %q) {print "$l\n";}' \ + >> $extra_questions || exit 1; + +echo "$(wc -l <$silence_phones) silence phones saved to: $silence_phones" +echo "$(wc -l <$optional_silence) optional silence saved to: $optional_silence" +echo "$(wc -l <$nonsil_phones) non-silence phones saved to: $nonsil_phones" +echo "$(wc -l <$extra_questions) extra triphone clustering-related questions saved to: $extra_questions" + +#(echo '!SIL SIL'; echo '[BREATH] BRH'; echo '[NOISE] NSN'; echo '[COUGH] CGH'; +# echo '[SMACK] SMK'; echo '[UM] UM'; echo '[UH] UHH' +# echo ' NSN' ) | \ +(echo '!SIL SIL'; echo ' SPN'; echo ' SPN'; ) |\ +cat - $lexicon_raw_nosil | sort | uniq >$dst_dir/lexicon.txt +echo "Lexicon text file saved as: $dst_dir/lexicon.txt" +exit 0 + diff --git a/egs/zeroth_korean/s5/local/score.sh b/egs/zeroth_korean/s5/local/score.sh new file mode 100755 index 00000000000..c812199fc98 --- /dev/null +++ b/egs/zeroth_korean/s5/local/score.sh @@ -0,0 +1,63 @@ +#!/bin/bash +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey) +# 2014 Guoguo Chen +# Apache 2.0 + +[ -f ./path.sh ] && . ./path.sh + +# begin configuration section. +cmd=run.pl +stage=0 +decode_mbr=true +word_ins_penalty=0.0,0.5,1.0 +min_lmwt=7 +max_lmwt=17 +iter=final +#end configuration section. 
+ +[ -f ./path.sh ] && . ./path.sh +. parse_options.sh || exit 1; + +if [ $# -ne 3 ]; then + echo "Usage: local/score.sh [--cmd (run.pl|queue.pl...)] " + echo " Options:" + echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." + echo " --stage (0|1|2) # start scoring script from part-way through." + echo " --decode_mbr (true/false) # maximum bayes risk decoding (confusion network)." + echo " --min_lmwt # minumum LM-weight for lattice rescoring " + echo " --max_lmwt # maximum LM-weight for lattice rescoring " + exit 1; +fi + +data=$1 +lang_or_graph=$2 +dir=$3 + +symtab=$lang_or_graph/words.txt + +for f in $symtab $dir/lat.1.gz $data/text; do + [ ! -f $f ] && echo "score.sh: no such file $f" && exit 1; +done + +mkdir -p $dir/scoring/log + +cat $data/text | sed 's:::g' | sed 's:::g' > $dir/scoring/test_filt.txt + +for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do + $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/best_path.LMWT.$wip.log \ + lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ + lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \ + lattice-best-path --word-symbol-table=$symtab \ + ark:- ark,t:$dir/scoring/LMWT.$wip.tra || exit 1; +done + +# Note: the double level of quoting for the sed command +for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do + $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.$wip.log \ + cat $dir/scoring/LMWT.$wip.tra \| \ + utils/int2sym.pl -f 2- $symtab \| sed 's:\::g' \| \ + compute-wer --text --mode=present \ + ark:$dir/scoring/test_filt.txt ark,p:- ">&" $dir/wer_LMWT_$wip || exit 1; +done + +exit 0; diff --git a/egs/zeroth_korean/s5/local/updateSegmentation.sh b/egs/zeroth_korean/s5/local/updateSegmentation.sh new file mode 100755 index 00000000000..e892f902837 --- /dev/null +++ b/egs/zeroth_korean/s5/local/updateSegmentation.sh @@ -0,0 +1,51 @@ +#!/bin/bash + +# Copyright 2017 Lucas Jo (Atlas Guide) +# Apache 2.0 + +# do this when the segmentation 
rule is changed +dataDir=$1 +lmDir=$2 + +exists(){ + command -v "$1" >/dev/null 2>&1 +} + +# check morfessor installation +if ! exists morfessor; then + echo "Please, install Morfessor" + exit 1 +fi + +trans=$dataDir/text +echo "Re-segment transcripts: $trans --------------------------------------------" +if [ ! -f $trans ]; then + echo "transcription file is not found in "$dataDir + exit 1 +fi +cp $trans $trans".old" +awk '{print $1}' $trans".old" > $trans"_tmp_index" +cut -d' ' -f2- $trans".old" |\ + sed -E 's/\s+/ /g; s/^\s//g; s/\s$//g' |\ + morfessor -l $lmDir/zeroth_morfessor.seg -T - -o - \ + --output-format '{analysis} ' --output-newlines \ + --nosplit-re '[0-9\[\]\(\){}a-zA-Z&.,\-]+' \ + | paste -d" " $trans"_tmp_index" - > $trans +rm -f $trans"_tmp_index" + +#transcripList=$(find $dataDir -name "*.norm.txt" -type f | sort) +#for transcript in $transcripList; +#do +# echo "read: " $transcript +# cat $transcript | awk '{print $1;}' > tmp +# cat $transcript | awk '{$1="";print $0;}' | \ +# local/strip.py | \ +# #morfessor -l $lmDir/data/_lexicon_/mergedCorpus.model4.reduced -T - -o tmp2 --output-format '{analysis} ' --output-newlines +# morfessor -l $lmDir/zeroth_morfessor.seg -T - -o tmp2 --output-format '{analysis} ' --output-newlines +# #$lmDir/data/_lm_/seg2sentence.py tmp2 > tmp3 +# +# array=(${transcript//\./ }) +# echo "write: " ${array[0]}.${array[1]}.txt +# paste -d" " tmp tmp2 > ${array[0]}.${array[1]}.txt +#done +#rm -f tmp* diff --git a/egs/zeroth_korean/s5/run.sh b/egs/zeroth_korean/s5/run.sh new file mode 100755 index 00000000000..48a3834050c --- /dev/null +++ b/egs/zeroth_korean/s5/run.sh @@ -0,0 +1,194 @@ +#!/bin/bash +# +# Based mostly on the WSJ/Librispeech recipe. The training database is #####, +# it consists of 51hrs korean speech with cleaned automatic transcripts: +# +# http://www.openslr.org/resources (Mirror). +# +# Copyright 2018 Atlas Guide (Author : Lucas Jo) +# 2018 Gridspace Inc. 
(Author: Wonkyum Lee) +# +# Apache 2.0 +# + +# Check list before start +# 1. locale setup +# 2. pre-installed package: awscli, Morfessor-2.0.1, flac, sox, same cuda library, unzip +# 3. pre-install or symbolic link for easy going: rirs_noises.zip (takes pretty long time) +# 4. parameters: nCPU, num_jobs_initial, num_jobs_final, --max-noises-per-minute + +db_dir=./db +nCPU=16 + +. ./cmd.sh +. ./path.sh + +# you might not want to do this for interactive shells. +set -e + +startTime=$(date +'%F-%H-%M') +echo "started at" $startTime + +# download the data. +local/download_and_untar.sh $db_dir + +# format the data as Kaldi data directories +for part in train_data_01 test_data_01; do + # use underscore-separated names in data directories. + local/data_prep.sh $db_dir/$part data/$(echo $part | sed s/-/_/g) +done + +# update segmentation of transcripts +for part in train_data_01 test_data_01; do + local/updateSegmentation.sh data/$part data/local/lm +done + +# prepare dictionary and language model +local/prepare_dict.sh data/local/lm data/local/dict_nosp + +utils/prepare_lang.sh data/local/dict_nosp \ + "" data/local/lang_tmp_nosp data/lang_nosp + +local/format_lms.sh --src-dir data/lang_nosp data/local/lm + +# Create ConstArpaLm format language model for full 3-gram and 4-gram LMs +# it takes long time and do this again after computing silence prob. +# you can do comment out here this time + +#utils/build_const_arpa_lm.sh data/local/lm/zeroth.lm.tg.arpa.gz \ +# data/lang_nosp data/lang_nosp_test_tglarge +#utils/build_const_arpa_lm.sh data/local/lm/zeroth.lm.fg.arpa.gz \ +# data/lang_nosp data/lang_nosp_test_fglarge + +# Feature extraction (MFCC) +mfccdir=mfcc +hostInAtlas="ares hephaestus jupiter neptune" +if [[ ! -z $(echo $hostInAtlas | grep -o $(hostname -f)) ]]; then + mfcc=$(basename mfccdir) # in case was absolute pathname (unlikely), get basename. 
+ utils/create_split_dir.pl /mnt/{ares,hephaestus,jupiter,neptune}/$USER/kaldi-data/zeroth-kaldi/s5/$mfcc/storage \ + $mfccdir/storage +fi +for part in train_data_01 test_data_01; do + steps/make_mfcc.sh --cmd "$train_cmd" --nj $nCPU data/$part exp/make_mfcc/$part $mfccdir + steps/compute_cmvn_stats.sh data/$part exp/make_mfcc/$part $mfccdir +done + +# ... and then combine data sets into one (for later extension) +utils/combine_data.sh \ + data/train_clean data/train_data_01 + +utils/combine_data.sh \ + data/test_clean data/test_data_01 + +# Make some small data subsets for early system-build stages. +utils/subset_data_dir.sh --shortest data/train_clean 2000 data/train_2kshort +utils/subset_data_dir.sh data/train_clean 5000 data/train_5k +utils/subset_data_dir.sh data/train_clean 10000 data/train_10k + +echo "#### Monophone Training ###########" +# train a monophone system & align +steps/train_mono.sh --boost-silence 1.25 --nj $nCPU --cmd "$train_cmd" \ + data/train_2kshort data/lang_nosp exp/mono +steps/align_si.sh --boost-silence 1.25 --nj $nCPU --cmd "$train_cmd" \ + data/train_5k data/lang_nosp exp/mono exp/mono_ali_5k + +echo "#### Triphone Training, delta + delta-delta ###########" +# train a first delta + delta-delta triphone system on a subset of 5000 utterancesa +# number of maximum pdf, gaussian (under/over fitting) +# recognition result +steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" \ + 2000 10000 data/train_5k data/lang_nosp exp/mono_ali_5k exp/tri1 +steps/align_si.sh --nj $nCPU --cmd "$train_cmd" \ + data/train_10k data/lang_nosp exp/tri1 exp/tri1_ali_10k + +echo "#### Triphone Training, LDA+MLLT ###########" +# train an LDA+MLLT system. 
+steps/train_lda_mllt.sh --cmd "$train_cmd" \ + --splice-opts "--left-context=3 --right-context=3" 2500 15000 \ + data/train_10k data/lang_nosp exp/tri1_ali_10k exp/tri2b + +# Align a 10k utts subset using the tri2b model +steps/align_si.sh --nj $nCPU --cmd "$train_cmd" --use-graphs true \ + data/train_clean data/lang_nosp exp/tri2b exp/tri2b_ali_train_clean + +echo "#### Triphone Training, LDA+MLLT+SAT ###########" +# Train tri3b, which is LDA+MLLT+SAT on 10k utts +#steps/train_sat.sh --cmd "$train_cmd" 3000 25000 \ +steps/train_sat.sh --cmd "$train_cmd" 4200 40000 \ + data/train_clean data/lang_nosp exp/tri2b_ali_train_clean exp/tri3b + +# Now we compute the pronunciation and silence probabilities from training data, +# and re-create the lang directory. +# silence transition probability ... +steps/get_prons.sh --cmd "$train_cmd" \ + data/train_clean data/lang_nosp exp/tri3b + +utils/dict_dir_add_pronprobs.sh --max-normalize true \ + data/local/dict_nosp \ + exp/tri3b/pron_counts_nowb.txt exp/tri3b/sil_counts_nowb.txt \ + exp/tri3b/pron_bigram_counts_nowb.txt data/local/dict + +utils/prepare_lang.sh data/local/dict \ + "" data/local/lang_tmp data/lang + +local/format_lms.sh --src-dir data/lang data/local/lm + +utils/build_const_arpa_lm.sh \ + data/local/lm/zeroth.lm.tg.arpa.gz data/lang data/lang_test_tglarge +utils/build_const_arpa_lm.sh \ + data/local/lm/zeroth.lm.fg.arpa.gz data/lang data/lang_test_fglarge + +# align the entire train_clean using the tri3b model +steps/align_fmllr.sh --nj $nCPU --cmd "$train_cmd" \ + data/train_clean data/lang exp/tri3b exp/tri3b_ali_train_clean + +echo "#### SAT again on train_clean ###########" +# train another LDA+MLLT+SAT system on the entire subset +steps/train_sat.sh --cmd "$train_cmd" 4200 40000 \ + data/train_clean data/lang exp/tri3b_ali_train_clean exp/tri4b + +# decode using the tri4b model with pronunciation and silence probabilities +utils/mkgraph.sh \ + data/lang_test_tgsmall exp/tri4b exp/tri4b/graph_tgsmall + +# 
the size is properly set? +utils/subset_data_dir.sh data/test_clean 200 data/test_200 + +for test in test_200; do + nspk=$(wc -l Date: Mon, 9 Jul 2018 20:01:47 -0700 Subject: [PATCH 03/26] cleaning script - simplified the script - delete unnecessary scripts and comments --- egs/zeroth_korean/s5/RESULTS | 24 -- egs/zeroth_korean/s5/conf/queue.conf | 10 - .../chain/multi_condition/run_tdnn_1b.sh | 299 ------------- .../chain/multi_condition/run_tdnn_1n.sh | 302 ------------- .../chain/multi_condition/run_tdnn_lstm_1e.sh | 328 --------------- egs/zeroth_korean/s5/local/chain/run_tdnn.sh | 1 + .../s5/local/chain/run_tdnn_1b.sh | 271 ------------ .../s5/local/chain/run_tdnn_1n.sh | 275 ------------ .../s5/local/chain/run_tdnn_lstm_1e.sh | 290 ------------- .../s5/local/chain/run_tdnn_opgru.sh | 1 + .../local/chain/{ => tuning}/run_tdnn_1a.sh | 87 ++-- .../run_tdnn_opgru_1a.sh} | 6 +- egs/zeroth_korean/s5/local/data_prep.sh | 10 +- .../s5/local/download_and_untar.sh | 2 +- .../s5/local/multi_condition/copy_ali_dir.sh | 78 ---- .../s5/local/nnet2/run_5a_recData01.sh | 76 ---- .../s5/local/nnet2/run_5a_train_2x.sh | 105 ----- .../s5/local/nnet2/run_5a_train_clean.sh | 77 ---- .../multi_condition/run_ivector_common.sh | 214 ---------- .../s5/local/nnet3/run_ivector_common.sh | 12 +- .../local/online/export_online_nnet2_model.sh | 33 -- .../s5/local/online/run_nnet2_common.sh | 101 ----- .../s5/local/online/run_nnet2_ms.sh | 267 ------------ .../s5/local/online/run_nnet2_ms_disc.sh | 164 -------- .../s5/local/updateSegmentation.sh | 19 +- egs/zeroth_korean/s5/run.sh | 397 +++++++++++------- 26 files changed, 297 insertions(+), 3152 deletions(-) delete mode 100644 egs/zeroth_korean/s5/RESULTS delete mode 100644 egs/zeroth_korean/s5/conf/queue.conf delete mode 100755 egs/zeroth_korean/s5/local/chain/multi_condition/run_tdnn_1b.sh delete mode 100755 egs/zeroth_korean/s5/local/chain/multi_condition/run_tdnn_1n.sh delete mode 100755 
egs/zeroth_korean/s5/local/chain/multi_condition/run_tdnn_lstm_1e.sh create mode 120000 egs/zeroth_korean/s5/local/chain/run_tdnn.sh delete mode 100755 egs/zeroth_korean/s5/local/chain/run_tdnn_1b.sh delete mode 100755 egs/zeroth_korean/s5/local/chain/run_tdnn_1n.sh delete mode 100755 egs/zeroth_korean/s5/local/chain/run_tdnn_lstm_1e.sh create mode 120000 egs/zeroth_korean/s5/local/chain/run_tdnn_opgru.sh rename egs/zeroth_korean/s5/local/chain/{ => tuning}/run_tdnn_1a.sh (77%) rename egs/zeroth_korean/s5/local/chain/{run_tdnn_opgru_1c.sh => tuning/run_tdnn_opgru_1a.sh} (98%) delete mode 100755 egs/zeroth_korean/s5/local/multi_condition/copy_ali_dir.sh delete mode 100755 egs/zeroth_korean/s5/local/nnet2/run_5a_recData01.sh delete mode 100755 egs/zeroth_korean/s5/local/nnet2/run_5a_train_2x.sh delete mode 100755 egs/zeroth_korean/s5/local/nnet2/run_5a_train_clean.sh delete mode 100755 egs/zeroth_korean/s5/local/nnet3/multi_condition/run_ivector_common.sh delete mode 100755 egs/zeroth_korean/s5/local/online/export_online_nnet2_model.sh delete mode 100755 egs/zeroth_korean/s5/local/online/run_nnet2_common.sh delete mode 100755 egs/zeroth_korean/s5/local/online/run_nnet2_ms.sh delete mode 100755 egs/zeroth_korean/s5/local/online/run_nnet2_ms_disc.sh diff --git a/egs/zeroth_korean/s5/RESULTS b/egs/zeroth_korean/s5/RESULTS deleted file mode 100644 index 8a189e3f501..00000000000 --- a/egs/zeroth_korean/s5/RESULTS +++ /dev/null @@ -1,24 +0,0 @@ -# In the results below, "tgsmall" is the pruned 3-gram LM, which is used for lattice generation. 
-# The following language models are then used for rescoring: -# a) tglarge- the full, non-pruned 3-gram LM -# b) fglarge- non-pruned 4-gram LM -# The "test-clean" sets generally contain, relatively cleaner Korean speech, -# the "test_200" are subset of "test-clean", designed for quick evaluation - -### SAT GMM model trained on the "train-01" set (51 hours "clean" speech) -decode_fglarge_test_200/wer_14_0.5:%WER 21.17 [ 873 / 4124, 93 ins, 172 del, 608 sub ] -decode_tglarge_test_200/wer_15_0.0:%WER 21.46 [ 885 / 4124, 101 ins, 168 del, 616 sub ] -decode_tgsmall_test_200/wer_14_0.5:%WER 33.83 [ 1395 / 4124, 85 ins, 330 del, 980 sub ] -decode_tgsmall_test_200.si/wer_14_0.0:%WER 46.02 [ 1898 / 4124, 133 ins, 389 del, 1376 sub ] - -### Chain model trained on the "train-01" set -tdnn1n_online/decode_fglarge_test_200/wer_13_1.0:%WER 11.25 [ 464 / 4124, 65 ins, 78 del, 321 sub ] -tdnn1n_online/decode_tgsmall_test_200/wer_13_0.0:%WER 18.09 [ 746 / 4124, 89 ins, 123 del, 534 sub ] -tdnn_opgru_1c_sp_online/decode_fglarge_test_200/wer_8_1.0:%WER 9.00 [ 371 / 4124, 50 ins, 63 del, 258 sub ] -tdnn_opgru_1c_sp_online/decode_tgsmall_test_200/wer_8_0.5:%WER 14.06 [ 580 / 4124, 62 ins, 92 del, 426 sub ] - -### Chain model trained on the "train-01" set with multi-conditioned data augmentation -tdnn1n_rvb_online/decode_fglarge_test_200/wer_10_0.0:%WER 10.11 [ 417 / 4124, 73 ins, 57 del, 287 sub ] -tdnn1n_rvb_online/decode_tgsmall_test_200/wer_8_0.5:%WER 16.27 [ 671 / 4124, 87 ins, 91 del, 493 sub ] -tdnn_lstm_1e_rvb_online/decode_fglarge_test_200/wer_13_0.0:%WER 11.47 [ 473 / 4124, 74 ins, 61 del, 338 sub ] -tdnn_lstm_1e_rvb_online/decode_tgsmall_test_200/wer_12_1.0:%WER 16.97 [ 700 / 4124, 72 ins, 130 del, 498 sub ] diff --git a/egs/zeroth_korean/s5/conf/queue.conf b/egs/zeroth_korean/s5/conf/queue.conf deleted file mode 100644 index 2aa9ee6a211..00000000000 --- a/egs/zeroth_korean/s5/conf/queue.conf +++ /dev/null @@ -1,10 +0,0 @@ -# Default configuration -command qsub -v PATH -cwd 
-S /bin/bash -j y -l arch=*64* -option mem=* -l mem_free=$0,ram_free=$0 -option mem=0 # Do not add anything to qsub_opts -option num_threads=* -pe smp $0 -option num_threads=1 # Do not add anything to qsub_opts -option max_jobs_run=* -tc $0 -default gpu=0 -option gpu=0 -option gpu=* -l gpu=$0 diff --git a/egs/zeroth_korean/s5/local/chain/multi_condition/run_tdnn_1b.sh b/egs/zeroth_korean/s5/local/chain/multi_condition/run_tdnn_1b.sh deleted file mode 100755 index c8ebaeb2e05..00000000000 --- a/egs/zeroth_korean/s5/local/chain/multi_condition/run_tdnn_1b.sh +++ /dev/null @@ -1,299 +0,0 @@ -#!/bin/bash - -set -e -o pipefail - -# First the options that are passed through to run_ivector_common.sh -# (some of which are also used in this script directly). -stage=0 -nj=30 -train_set=train_clean -num_data_reps=1 # number of reverberated copies of data to generate -speed_perturb=true -test_sets="test_200" -gmm=tri4b # this is the source gmm-dir that we'll use for alignments; it - # should have alignments for the specified training data. -nnet3_affix=_rvb # affix for exp dirs, e.g. it was _cleaned in tedlium. - -# Options which are not passed through to run_ivector_common.sh -affix=1b_rvb #affix for TDNN directory e.g. "1a" or "1b", in case we change the configuration. -common_egs_dir= - -# LSTM/chain options -train_stage=-10 -xent_regularize=0.1 -max_param_change=2.0 - -# training chunk-options -chunk_width=150 -# we don't need extra left/right context for TDNN systems. -chunk_left_context=0 -chunk_right_context=0 - -# training options -srand=0 -num_jobs_initial=2 -num_jobs_final=12 -num_epochs=4 -minibatch_size=128 -initial_effective_lrate=0.001 -final_effective_lrate=0.0001 -remove_egs=true - - -#decode options -test_online_decoding=true # if true, it will run the last decoding stage. - -# End configuration section. -echo "$0 $@" # Print the command line for logging - - -. ./cmd.sh -. ./path.sh -. ./utils/parse_options.sh - - -if ! 
cuda-compiled; then - cat <$lang/topo - fi -fi - -if [ $stage -le 9 ]; then - # Get the alignments as lattices (gives the chain training more freedom). - # use the same num-jobs as the alignments - steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" ${lores_train_data_dir} \ - data/lang $gmm_dir $clean_lat_dir - rm $clean_lat_dir/fsts.*.gz # save space - # Create the lattices for the reverberated data - - # We use the lattices/alignments from the clean data for the reverberated data. - mkdir -p $lat_dir/temp/ - lattice-copy "ark:gunzip -c $clean_lat_dir/lat.*.gz |" ark,scp:$lat_dir/temp/lats.ark,$lat_dir/temp/lats.scp - - # copy the lattices for the reverberated data - rm -f $lat_dir/temp/combined_lats.scp - touch $lat_dir/temp/combined_lats.scp - # Here prefix "rev0_" represents the clean set, "rev1_" represents the reverberated set - for i in `seq 0 $num_data_reps`; do - cat $lat_dir/temp/lats.scp | sed -e "s/^/rev${i}_/" >> $lat_dir/temp/combined_lats.scp - done - sort -u $lat_dir/temp/combined_lats.scp > $lat_dir/temp/combined_lats_sorted.scp - - lattice-copy scp:$lat_dir/temp/combined_lats_sorted.scp "ark:|gzip -c >$lat_dir/lat.1.gz" || exit 1; - echo "1" > $lat_dir/num_jobs - - # copy other files from original lattice dir - for f in cmvn_opts final.mdl splice_opts tree; do - cp $clean_lat_dir/$f $lat_dir/$f - done - -fi - -if [ $stage -le 10 ]; then - # Build a tree using our new topology. - if [ -f $tree_dir/final.mdl ]; then - echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
- exit 1; - fi - steps/nnet3/chain/build_tree.sh \ - --frame-subsampling-factor 3 \ - --context-opts "--context-width=2 --central-position=1" \ - --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ - $lang $ali_dir $tree_dir -fi - -if [ $stage -le 11 ]; then - mkdir -p $dir - echo "$0: creating neural net configs using the xconfig parser"; - - num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) - - mkdir -p $dir/configs - cat < $dir/configs/network.xconfig - input dim=100 name=ivector - input dim=40 name=input - - # please note that it is important to have input layer with the name=input - # as the layer immediately preceding the fixed-affine-layer to enable - # the use of short notation for the descriptor - fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat - - # the first splicing is moved before the lda layer, so no splicing here - relu-renorm-layer name=tdnn1 dim=512 - relu-renorm-layer name=tdnn2 dim=512 input=Append(-1,0,1) - relu-renorm-layer name=tdnn3 dim=512 input=Append(-1,0,1) - relu-renorm-layer name=tdnn4 dim=512 input=Append(-3,0,3) - relu-renorm-layer name=tdnn5 dim=512 input=Append(-3,0,3) - relu-renorm-layer name=tdnn6 dim=512 input=Append(-6,-3,0) - - ## adding the layers for chain branch - relu-renorm-layer name=prefinal-chain dim=512 target-rms=0.5 - output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 - - # adding the layers for xent branch - # This block prints the configs for a separate output that will be - # trained with a cross-entropy objective in the 'chain' models... this - # has the effect of regularizing the hidden parts of the model. 
we use - # 0.5 / args.xent_regularize as the learning rate factor- the factor of - # 0.5 / args.xent_regularize is suitable as it means the xent - # final-layer learns at a rate independent of the regularization - # constant; and the 0.5 was tuned so as to make the relative progress - # similar in the xent and regular final layers. - relu-renorm-layer name=prefinal-xent input=tdnn6 dim=512 target-rms=0.5 - output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 -EOF - steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ -fi - - -if [ $stage -le 12 ]; then - if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then - utils/create_split_dir.pl \ - /export/b0{3,4,5,6}/$USER/kaldi-data/egs/wsj-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage - fi - - steps/nnet3/chain/train.py --stage $train_stage \ - --cmd "$decode_cmd" \ - --feat.online-ivector-dir=$train_ivector_dir \ - --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ - --chain.xent-regularize $xent_regularize \ - --chain.leaky-hmm-coefficient 0.1 \ - --chain.l2-regularize 0.00005 \ - --chain.apply-deriv-weights false \ - --chain.lm-opts="--num-extra-lm-states=2000" \ - --trainer.srand $srand \ - --trainer.max-param-change $max_param_change \ - --trainer.num-epochs $num_epochs \ - --trainer.frames-per-iter 1500000 \ - --trainer.optimization.num-jobs-initial $num_jobs_initial \ - --trainer.optimization.num-jobs-final $num_jobs_final \ - --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ - --trainer.optimization.final-effective-lrate $final_effective_lrate \ - --trainer.num-chunk-per-minibatch $minibatch_size \ - --egs.chunk-width $chunk_width \ - --egs.chunk-left-context $chunk_left_context \ - --egs.chunk-right-context $chunk_right_context \ - --egs.dir "$common_egs_dir" \ - --egs.opts "--frames-overlap-per-eg 0" \ - --cleanup.remove-egs $remove_egs \ - 
--use-gpu true \ - --feat-dir $train_data_dir \ - --tree-dir $tree_dir \ - --lat-dir $lat_dir \ - --dir $dir || exit 1; -fi - -if [ $stage -le 13 ]; then - # The reason we are using data/lang here, instead of $lang, is just to - # emphasize that it's not actually important to give mkgraph.sh the - # lang directory with the matched topology (since it gets the - # topology file from the model). So you could give it a different - # lang directory, one that contained a wordlist and LM of your choice, - # as long as phones.txt was compatible. - - utils/lang/check_phones_compatible.sh \ - data/lang_test_tgsmall/phones.txt $lang/phones.txt - utils/mkgraph.sh \ - --self-loop-scale 1.0 data/lang_test_tgsmall \ - $tree_dir $tree_dir/graph_tgsmall || exit 1; -fi - -if $test_online_decoding && [ $stage -le 14 ]; then - # note: if the features change (e.g. you add pitch features), you will have to - # change the options of the following command line. - steps/online/nnet3/prepare_online_decoding.sh \ - --mfcc-config conf/mfcc_hires.conf \ - $lang exp/nnet3_rvb/extractor ${dir} ${dir}_online - - rm $dir/.error 2>/dev/null || true - - for data in $test_sets; do - ( - data_affix=$(echo $data | sed s/test_//) - nspk=$(wc -l $lang/topo - fi -fi - -if [ $stage -le 9 ]; then - # Get the alignments as lattices (gives the chain training more freedom). - # use the same num-jobs as the alignments - steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" ${lores_train_data_dir} \ - data/lang $gmm_dir $clean_lat_dir - rm $clean_lat_dir/fsts.*.gz # save space - # Create the lattices for the reverberated data - - # We use the lattices/alignments from the clean data for the reverberated data. 
- mkdir -p $lat_dir/temp/ - lattice-copy "ark:gunzip -c $clean_lat_dir/lat.*.gz |" ark,scp:$lat_dir/temp/lats.ark,$lat_dir/temp/lats.scp - - # copy the lattices for the reverberated data - rm -f $lat_dir/temp/combined_lats.scp - touch $lat_dir/temp/combined_lats.scp - # Here prefix "rev0_" represents the clean set, "rev1_" represents the reverberated set - for i in `seq 0 $num_data_reps`; do - cat $lat_dir/temp/lats.scp | sed -e "s/^/rev${i}_/" >> $lat_dir/temp/combined_lats.scp - done - sort -u $lat_dir/temp/combined_lats.scp > $lat_dir/temp/combined_lats_sorted.scp - - lattice-copy scp:$lat_dir/temp/combined_lats_sorted.scp "ark:|gzip -c >$lat_dir/lat.1.gz" || exit 1; - echo "1" > $lat_dir/num_jobs - - # copy other files from original lattice dir - for f in cmvn_opts final.mdl splice_opts tree; do - cp $clean_lat_dir/$f $lat_dir/$f - done - -fi - -if [ $stage -le 10 ]; then - # Build a tree using our new topology. - if [ -f $tree_dir/final.mdl ]; then - echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
- exit 1; - fi - steps/nnet3/chain/build_tree.sh \ - --frame-subsampling-factor 3 \ - --context-opts "--context-width=2 --central-position=1" \ - --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ - $lang $ali_dir $tree_dir -fi - -if [ $stage -le 11 ]; then - mkdir -p $dir - echo "$0: creating neural net configs using the xconfig parser"; - num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) - opts="l2-regularize=0.002" - linear_opts="orthonormal-constraint=1.0" - output_opts="l2-regularize=0.0005 bottleneck-dim=256" - - mkdir -p $dir/configs - - cat < $dir/configs/network.xconfig - input dim=100 name=ivector - input dim=40 name=input - - # please note that it is important to have input layer with the name=input - # as the layer immediately preceding the fixed-affine-layer to enable - # the use of short notation for the descriptor - fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat - - # the first splicing is moved before the lda layer, so no splicing here - relu-batchnorm-layer name=tdnn1 $opts dim=1280 - linear-component name=tdnn2l dim=256 $linear_opts input=Append(-1,0) - relu-batchnorm-layer name=tdnn2 $opts input=Append(0,1) dim=1280 - linear-component name=tdnn3l dim=256 $linear_opts - relu-batchnorm-layer name=tdnn3 $opts dim=1280 - linear-component name=tdnn4l dim=256 $linear_opts input=Append(-1,0) - relu-batchnorm-layer name=tdnn4 $opts input=Append(0,1) dim=1280 - linear-component name=tdnn5l dim=256 $linear_opts - relu-batchnorm-layer name=tdnn5 $opts dim=1280 input=Append(tdnn5l, tdnn3l) - linear-component name=tdnn6l dim=256 $linear_opts input=Append(-3,0) - relu-batchnorm-layer name=tdnn6 $opts input=Append(0,3) dim=1280 - linear-component name=tdnn7l dim=256 $linear_opts input=Append(-3,0) - relu-batchnorm-layer name=tdnn7 $opts input=Append(0,3,tdnn6l,tdnn4l,tdnn2l) dim=1280 - 
linear-component name=tdnn8l dim=256 $linear_opts input=Append(-3,0) - relu-batchnorm-layer name=tdnn8 $opts input=Append(0,3) dim=1280 - linear-component name=tdnn9l dim=256 $linear_opts input=Append(-3,0) - relu-batchnorm-layer name=tdnn9 $opts input=Append(0,3,tdnn8l,tdnn6l,tdnn4l) dim=1280 - linear-component name=tdnn10l dim=256 $linear_opts input=Append(-3,0) - relu-batchnorm-layer name=tdnn10 $opts input=Append(0,3) dim=1280 - linear-component name=tdnn11l dim=256 $linear_opts input=Append(-3,0) - relu-batchnorm-layer name=tdnn11 $opts input=Append(0,3,tdnn10l,tdnn8l,tdnn6l) dim=1280 - linear-component name=prefinal-l dim=256 $linear_opts - - relu-batchnorm-layer name=prefinal-chain input=prefinal-l $opts dim=1280 - output-layer name=output include-log-softmax=false dim=$num_targets $output_opts - - relu-batchnorm-layer name=prefinal-xent input=prefinal-l $opts dim=1280 - output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts -EOF - steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ -fi - - -if [ $stage -le 12 ]; then - if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then - utils/create_split_dir.pl \ - /export/b0{3,4,5,6}/$USER/kaldi-data/egs/wsj-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage - fi - - steps/nnet3/chain/train.py --stage $train_stage \ - --cmd "$decode_cmd" \ - --feat.online-ivector-dir=$train_ivector_dir \ - --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ - --chain.xent-regularize $xent_regularize \ - --chain.leaky-hmm-coefficient 0.1 \ - --chain.l2-regularize 0.0 \ - --chain.apply-deriv-weights false \ - --chain.lm-opts="--num-extra-lm-states=2000" \ - --trainer.max-param-change $max_param_change \ - --trainer.num-epochs $num_epochs \ - --trainer.frames-per-iter 1500000 \ - --trainer.optimization.num-jobs-initial $num_jobs_initial \ - --trainer.optimization.num-jobs-final $num_jobs_final \ - --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ - --trainer.optimization.final-effective-lrate $final_effective_lrate \ - --trainer.num-chunk-per-minibatch $minibatch_size \ - --egs.stage $get_egs_stage \ - --egs.chunk-width $chunk_width \ - --egs.dir "$common_egs_dir" \ - --egs.opts "--frames-overlap-per-eg 0" \ - --cleanup.remove-egs $remove_egs \ - --use-gpu true \ - --feat-dir $train_data_dir \ - --tree-dir $tree_dir \ - --lat-dir $lat_dir \ - --dir $dir || exit 1; -fi - -if [ $stage -le 13 ]; then - # The reason we are using data/lang here, instead of $lang, is just to - # emphasize that it's not actually important to give mkgraph.sh the - # lang directory with the matched topology (since it gets the - # topology file from the model). So you could give it a different - # lang directory, one that contained a wordlist and LM of your choice, - # as long as phones.txt was compatible. 
- - utils/lang/check_phones_compatible.sh \ - data/lang_test_tgsmall/phones.txt $lang/phones.txt - utils/mkgraph.sh \ - --self-loop-scale 1.0 data/lang_test_tgsmall \ - $tree_dir $tree_dir/graph_tgsmall || exit 1; -fi - -if $test_online_decoding && [ $stage -le 14 ]; then - # note: if the features change (e.g. you add pitch features), you will have to - # change the options of the following command line. - steps/online/nnet3/prepare_online_decoding.sh \ - --mfcc-config conf/mfcc_hires.conf \ - $lang exp/nnet3_rvb/extractor ${dir} ${dir}_online - - rm $dir/.error 2>/dev/null || true - - for data in $test_sets; do - ( - data_affix=$(echo $data | sed s/test_//) - nspk=$(wc -l $lang/topo - fi -fi - -if [ $stage -le 9 ]; then - # Get the alignments as lattices (gives the chain training more freedom). - # use the same num-jobs as the alignments - steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" ${lores_train_data_dir} \ - data/lang $gmm_dir $clean_lat_dir - rm $clean_lat_dir/fsts.*.gz # save space - # Create the lattices for the reverberated data - - # We use the lattices/alignments from the clean data for the reverberated data. 
- mkdir -p $lat_dir/temp/ - lattice-copy "ark:gunzip -c $clean_lat_dir/lat.*.gz |" ark,scp:$lat_dir/temp/lats.ark,$lat_dir/temp/lats.scp - - # copy the lattices for the reverberated data - rm -f $lat_dir/temp/combined_lats.scp - touch $lat_dir/temp/combined_lats.scp - # Here prefix "rev0_" represents the clean set, "rev1_" represents the reverberated set - for i in `seq 0 $num_data_reps`; do - cat $lat_dir/temp/lats.scp | sed -e "s/^/rev${i}_/" >> $lat_dir/temp/combined_lats.scp - done - sort -u $lat_dir/temp/combined_lats.scp > $lat_dir/temp/combined_lats_sorted.scp - - lattice-copy scp:$lat_dir/temp/combined_lats_sorted.scp "ark:|gzip -c >$lat_dir/lat.1.gz" || exit 1; - echo "1" > $lat_dir/num_jobs - - # copy other files from original lattice dir - for f in cmvn_opts final.mdl splice_opts tree; do - cp $clean_lat_dir/$f $lat_dir/$f - done - -fi - -if [ $stage -le 10 ]; then - # Build a tree using our new topology. - if [ -f $tree_dir/final.mdl ]; then - echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
- exit 1; - fi - steps/nnet3/chain/build_tree.sh \ - --frame-subsampling-factor 3 \ - --context-opts "--context-width=2 --central-position=1" \ - --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ - $lang $ali_dir $tree_dir -fi - -if [ $stage -le 11 ]; then - echo "$0: creating neural net configs using the xconfig parser"; - - num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) - - lstm_opts="decay-time=20" - - mkdir -p $dir/configs - cat < $dir/configs/network.xconfig - input dim=100 name=ivector - input dim=40 name=input - - # please note that it is important to have input layer with the name=input - # as the layer immediately preceding the fixed-affine-layer to enable - # the use of short notation for the descriptor - fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat - - # the first splicing is moved before the lda layer, so no splicing here - relu-renorm-layer name=tdnn1 dim=1024 - relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 - relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 - - # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults - fast-lstmp-layer name=fastlstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts - relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 - relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 - fast-lstmp-layer name=fastlstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts - relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024 - relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=1024 - fast-lstmp-layer name=fastlstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts - - ## adding the layers for 
chain branch - output-layer name=output input=fastlstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 - - # adding the layers for xent branch - # This block prints the configs for a separate output that will be - # trained with a cross-entropy objective in the 'chain' models... this - # has the effect of regularizing the hidden parts of the model. we use - # 0.5 / args.xent_regularize as the learning rate factor- the factor of - # 0.5 / args.xent_regularize is suitable as it means the xent - # final-layer learns at a rate independent of the regularization - # constant; and the 0.5 was tuned so as to make the relative progress - # similar in the xent and regular final layers. - output-layer name=output-xent input=fastlstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 - -EOF - steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ - -fi - - -if [ $stage -le 12 ]; then - - hostInAtlas="ares hephaestus jupiter neptune" - if [[ ! -z $(echo $hostInAtlas | grep -o $(hostname -f)) ]] && [ ! -d $dir/egs/storage ]; then - utils/create_split_dir.pl /mnt/{ares,hephaestus,jupiter,neptune}/$USER/kaldi-data/zeroth-kaldi-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage \ - $dir/egs/storage - fi - #if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then - # utils/create_split_dir.pl \ - # /export/b0{3,4,5,6}/$USER/kaldi-data/egs/wsj-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage - #fi - steps/nnet3/chain/train.py --stage $train_stage \ - --cmd "$decode_cmd" \ - --feat.online-ivector-dir=$train_ivector_dir \ - --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ - --chain.xent-regularize $xent_regularize \ - --chain.leaky-hmm-coefficient 0.1 \ - --chain.l2-regularize 0.00005 \ - --chain.apply-deriv-weights false \ - --chain.lm-opts="--num-extra-lm-states=2000" \ - --trainer.num-chunk-per-minibatch 64,32 \ - --trainer.frames-per-iter 1500000 \ - --trainer.max-param-change $max_param_change \ - --trainer.num-epochs $num_epochs \ - --trainer.optimization.shrink-value 0.99 \ - --trainer.optimization.num-jobs-initial $num_jobs_initial \ - --trainer.optimization.num-jobs-final $num_jobs_final \ - --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ - --trainer.optimization.final-effective-lrate $final_effective_lrate \ - --trainer.optimization.momentum 0.0 \ - --trainer.deriv-truncate-margin 8 \ - --egs.stage $get_egs_stage \ - --egs.opts "--frames-overlap-per-eg 0" \ - --egs.chunk-width $frames_per_chunk \ - --egs.chunk-left-context $chunk_left_context \ - --egs.chunk-right-context $chunk_right_context \ - --egs.chunk-left-context-initial 0 \ - --egs.chunk-right-context-final 0 \ - --egs.dir "$common_egs_dir" \ - --cleanup.remove-egs $remove_egs \ - --feat-dir $train_data_dir \ - --tree-dir $tree_dir \ - --lat-dir $lat_dir \ - --dir $dir || exit 1; -fi - -if [ $stage -le 13 ]; then - # The reason we are using data/lang here, instead of $lang, is just to - # emphasize that it's not actually important to give mkgraph.sh the - # lang directory with the matched topology (since it gets the - # topology file from the model). 
So you could give it a different - # lang directory, one that contained a wordlist and LM of your choice, - # as long as phones.txt was compatible. - - utils/lang/check_phones_compatible.sh \ - data/lang_test_tgsmall/phones.txt $lang/phones.txt - utils/mkgraph.sh \ - --self-loop-scale 1.0 data/lang_test_tgsmall \ - $tree_dir $tree_dir/graph_tgsmall || exit 1; -fi - -if $test_online_decoding && [ $stage -le 14 ]; then - # note: if the features change (e.g. you add pitch features), you will have to - # change the options of the following command line. - steps/online/nnet3/prepare_online_decoding.sh \ - --mfcc-config conf/mfcc_hires.conf \ - $lang exp/nnet3_rvb/extractor ${dir} ${dir}_online - - rm $dir/.error 2>/dev/null || true - - for data in $test_sets; do - ( - data_affix=$(echo $data | sed s/test_//) - nspk=$(wc -l $lang/topo - fi -fi - -if [ $stage -le 9 ]; then - # Get the alignments as lattices (gives the chain training more freedom). - # use the same num-jobs as the alignments - steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" ${lores_train_data_dir} \ - data/lang $gmm_dir $lat_dir - rm $lat_dir/fsts.*.gz # save space -fi - -if [ $stage -le 10 ]; then - # Build a tree using our new topology. - if [ -f $tree_dir/final.mdl ]; then - echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
- exit 1; - fi - steps/nnet3/chain/build_tree.sh \ - --frame-subsampling-factor 3 \ - --context-opts "--context-width=2 --central-position=1" \ - --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ - $lang $ali_dir $tree_dir -fi - -if [ $stage -le 11 ]; then - mkdir -p $dir - echo "$0: creating neural net configs using the xconfig parser"; - - num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) - - mkdir -p $dir/configs - cat < $dir/configs/network.xconfig - input dim=100 name=ivector - input dim=40 name=input - - # please note that it is important to have input layer with the name=input - # as the layer immediately preceding the fixed-affine-layer to enable - # the use of short notation for the descriptor - fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat - - # the first splicing is moved before the lda layer, so no splicing here - relu-renorm-layer name=tdnn1 dim=512 - relu-renorm-layer name=tdnn2 dim=512 input=Append(-1,0,1) - relu-renorm-layer name=tdnn3 dim=512 input=Append(-1,0,1) - relu-renorm-layer name=tdnn4 dim=512 input=Append(-3,0,3) - relu-renorm-layer name=tdnn5 dim=512 input=Append(-3,0,3) - relu-renorm-layer name=tdnn6 dim=512 input=Append(-6,-3,0) - - ## adding the layers for chain branch - relu-renorm-layer name=prefinal-chain dim=512 target-rms=0.5 - output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 - - # adding the layers for xent branch - # This block prints the configs for a separate output that will be - # trained with a cross-entropy objective in the 'chain' models... this - # has the effect of regularizing the hidden parts of the model. 
we use - # 0.5 / args.xent_regularize as the learning rate factor- the factor of - # 0.5 / args.xent_regularize is suitable as it means the xent - # final-layer learns at a rate independent of the regularization - # constant; and the 0.5 was tuned so as to make the relative progress - # similar in the xent and regular final layers. - relu-renorm-layer name=prefinal-xent input=tdnn6 dim=512 target-rms=0.5 - output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 -EOF - steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ -fi - - -if [ $stage -le 12 ]; then - if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then - utils/create_split_dir.pl \ - /export/b0{3,4,5,6}/$USER/kaldi-data/egs/wsj-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage - fi - - steps/nnet3/chain/train.py --stage $train_stage \ - --cmd "$decode_cmd" \ - --feat.online-ivector-dir=$train_ivector_dir \ - --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ - --chain.xent-regularize $xent_regularize \ - --chain.leaky-hmm-coefficient 0.1 \ - --chain.l2-regularize 0.00005 \ - --chain.apply-deriv-weights false \ - --chain.lm-opts="--num-extra-lm-states=2000" \ - --trainer.srand $srand \ - --trainer.max-param-change $max_param_change \ - --trainer.num-epochs $num_epochs \ - --trainer.frames-per-iter 1500000 \ - --trainer.optimization.num-jobs-initial $num_jobs_initial \ - --trainer.optimization.num-jobs-final $num_jobs_final \ - --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ - --trainer.optimization.final-effective-lrate $final_effective_lrate \ - --trainer.num-chunk-per-minibatch $minibatch_size \ - --egs.chunk-width $chunk_width \ - --egs.chunk-left-context $chunk_left_context \ - --egs.chunk-right-context $chunk_right_context \ - --egs.dir "$common_egs_dir" \ - --egs.opts "--frames-overlap-per-eg 0" \ - --cleanup.remove-egs $remove_egs \ - 
--use-gpu true \ - --feat-dir $train_data_dir \ - --tree-dir $tree_dir \ - --lat-dir $lat_dir \ - --dir $dir || exit 1; -fi - -if [ $stage -le 13 ]; then - # The reason we are using data/lang here, instead of $lang, is just to - # emphasize that it's not actually important to give mkgraph.sh the - # lang directory with the matched topology (since it gets the - # topology file from the model). So you could give it a different - # lang directory, one that contained a wordlist and LM of your choice, - # as long as phones.txt was compatible. - - utils/lang/check_phones_compatible.sh \ - data/lang_test_tgsmall/phones.txt $lang/phones.txt - utils/mkgraph.sh \ - --self-loop-scale 1.0 data/lang_test_tgsmall \ - $tree_dir $tree_dir/graph_tgsmall || exit 1; -fi - -if $test_online_decoding && [ $stage -le 14 ]; then - # note: if the features change (e.g. you add pitch features), you will have to - # change the options of the following command line. - steps/online/nnet3/prepare_online_decoding.sh \ - --mfcc-config conf/mfcc_hires.conf \ - $lang exp/nnet3/extractor ${dir} ${dir}_online - - rm $dir/.error 2>/dev/null || true - - for data in $test_sets; do - ( - data_affix=$(echo $data | sed s/test_//) - nspk=$(wc -l $lang/topo - fi -fi - -if [ $stage -le 9 ]; then - # Get the alignments as lattices (gives the chain training more freedom). - # use the same num-jobs as the alignments - steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" ${lores_train_data_dir} \ - data/lang $gmm_dir $lat_dir - rm $lat_dir/fsts.*.gz # save space -fi - -if [ $stage -le 10 ]; then - # Build a tree using our new topology. - if [ -f $tree_dir/final.mdl ]; then - echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
- exit 1; - fi - steps/nnet3/chain/build_tree.sh \ - --frame-subsampling-factor 3 \ - --context-opts "--context-width=2 --central-position=1" \ - --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ - $lang $ali_dir $tree_dir -fi - -if [ $stage -le 11 ]; then - mkdir -p $dir - echo "$0: creating neural net configs using the xconfig parser"; - num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) - opts="l2-regularize=0.002" - linear_opts="orthonormal-constraint=1.0" - output_opts="l2-regularize=0.0005 bottleneck-dim=256" - - mkdir -p $dir/configs - - cat < $dir/configs/network.xconfig - input dim=100 name=ivector - input dim=40 name=input - - # please note that it is important to have input layer with the name=input - # as the layer immediately preceding the fixed-affine-layer to enable - # the use of short notation for the descriptor - fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat - - # the first splicing is moved before the lda layer, so no splicing here - relu-batchnorm-layer name=tdnn1 $opts dim=1280 - linear-component name=tdnn2l dim=256 $linear_opts input=Append(-1,0) - relu-batchnorm-layer name=tdnn2 $opts input=Append(0,1) dim=1280 - linear-component name=tdnn3l dim=256 $linear_opts - relu-batchnorm-layer name=tdnn3 $opts dim=1280 - linear-component name=tdnn4l dim=256 $linear_opts input=Append(-1,0) - relu-batchnorm-layer name=tdnn4 $opts input=Append(0,1) dim=1280 - linear-component name=tdnn5l dim=256 $linear_opts - relu-batchnorm-layer name=tdnn5 $opts dim=1280 input=Append(tdnn5l, tdnn3l) - linear-component name=tdnn6l dim=256 $linear_opts input=Append(-3,0) - relu-batchnorm-layer name=tdnn6 $opts input=Append(0,3) dim=1280 - linear-component name=tdnn7l dim=256 $linear_opts input=Append(-3,0) - relu-batchnorm-layer name=tdnn7 $opts input=Append(0,3,tdnn6l,tdnn4l,tdnn2l) dim=1280 - 
linear-component name=tdnn8l dim=256 $linear_opts input=Append(-3,0) - relu-batchnorm-layer name=tdnn8 $opts input=Append(0,3) dim=1280 - linear-component name=tdnn9l dim=256 $linear_opts input=Append(-3,0) - relu-batchnorm-layer name=tdnn9 $opts input=Append(0,3,tdnn8l,tdnn6l,tdnn4l) dim=1280 - linear-component name=tdnn10l dim=256 $linear_opts input=Append(-3,0) - relu-batchnorm-layer name=tdnn10 $opts input=Append(0,3) dim=1280 - linear-component name=tdnn11l dim=256 $linear_opts input=Append(-3,0) - relu-batchnorm-layer name=tdnn11 $opts input=Append(0,3,tdnn10l,tdnn8l,tdnn6l) dim=1280 - linear-component name=prefinal-l dim=256 $linear_opts - - relu-batchnorm-layer name=prefinal-chain input=prefinal-l $opts dim=1280 - output-layer name=output include-log-softmax=false dim=$num_targets $output_opts - - relu-batchnorm-layer name=prefinal-xent input=prefinal-l $opts dim=1280 - output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts -EOF - steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ -fi - - -if [ $stage -le 12 ]; then - if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then - utils/create_split_dir.pl \ - /export/b0{3,4,5,6}/$USER/kaldi-data/egs/wsj-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage - fi - - steps/nnet3/chain/train.py --stage $train_stage \ - --cmd "$decode_cmd" \ - --feat.online-ivector-dir=$train_ivector_dir \ - --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ - --chain.xent-regularize $xent_regularize \ - --chain.leaky-hmm-coefficient 0.1 \ - --chain.l2-regularize 0.0 \ - --chain.apply-deriv-weights false \ - --chain.lm-opts="--num-extra-lm-states=2000" \ - --trainer.max-param-change $max_param_change \ - --trainer.num-epochs $num_epochs \ - --trainer.frames-per-iter 1500000 \ - --trainer.optimization.num-jobs-initial $num_jobs_initial \ - --trainer.optimization.num-jobs-final $num_jobs_final \ - --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ - --trainer.optimization.final-effective-lrate $final_effective_lrate \ - --trainer.num-chunk-per-minibatch $minibatch_size \ - --egs.stage $get_egs_stage \ - --egs.chunk-width $chunk_width \ - --egs.dir "$common_egs_dir" \ - --egs.opts "--frames-overlap-per-eg 0" \ - --cleanup.remove-egs $remove_egs \ - --use-gpu true \ - --feat-dir $train_data_dir \ - --tree-dir $tree_dir \ - --lat-dir $lat_dir \ - --dir $dir || exit 1; -fi - -if [ $stage -le 13 ]; then - # The reason we are using data/lang here, instead of $lang, is just to - # emphasize that it's not actually important to give mkgraph.sh the - # lang directory with the matched topology (since it gets the - # topology file from the model). So you could give it a different - # lang directory, one that contained a wordlist and LM of your choice, - # as long as phones.txt was compatible. 
- - utils/lang/check_phones_compatible.sh \ - data/lang_test_tgsmall/phones.txt $lang/phones.txt - utils/mkgraph.sh \ - --self-loop-scale 1.0 data/lang_test_tgsmall \ - $tree_dir $tree_dir/graph_tgsmall || exit 1; -fi - -if $test_online_decoding && [ $stage -le 14 ]; then - # note: if the features change (e.g. you add pitch features), you will have to - # change the options of the following command line. - steps/online/nnet3/prepare_online_decoding.sh \ - --mfcc-config conf/mfcc_hires.conf \ - $lang exp/nnet3/extractor ${dir} ${dir}_online - - rm $dir/.error 2>/dev/null || true - - for data in $test_sets; do - ( - data_affix=$(echo $data | sed s/test_//) - nspk=$(wc -l $lang/topo - fi -fi - -if [ $stage -le 9 ]; then - # Get the alignments as lattices (gives the chain training more freedom). - # use the same num-jobs as the alignments - steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" ${lores_train_data_dir} \ - data/lang $gmm_dir $lat_dir - rm $lat_dir/fsts.*.gz # save space -fi - -if [ $stage -le 10 ]; then - # Build a tree using our new topology. - if [ -f $tree_dir/final.mdl ]; then - echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
- exit 1; - fi - steps/nnet3/chain/build_tree.sh \ - --frame-subsampling-factor 3 \ - --context-opts "--context-width=2 --central-position=1" \ - --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ - $lang $ali_dir $tree_dir -fi - -if [ $stage -le 11 ]; then - echo "$0: creating neural net configs using the xconfig parser"; - - num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) - - lstm_opts="decay-time=20" - - mkdir -p $dir/configs - cat < $dir/configs/network.xconfig - input dim=100 name=ivector - input dim=40 name=input - - # please note that it is important to have input layer with the name=input - # as the layer immediately preceding the fixed-affine-layer to enable - # the use of short notation for the descriptor - fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat - - # the first splicing is moved before the lda layer, so no splicing here - relu-renorm-layer name=tdnn1 dim=1024 - relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 - relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 - - # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults - fast-lstmp-layer name=fastlstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts - relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 - relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 - fast-lstmp-layer name=fastlstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts - relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024 - relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=1024 - fast-lstmp-layer name=fastlstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts - - ## adding the layers for 
chain branch - output-layer name=output input=fastlstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 - - # adding the layers for xent branch - # This block prints the configs for a separate output that will be - # trained with a cross-entropy objective in the 'chain' models... this - # has the effect of regularizing the hidden parts of the model. we use - # 0.5 / args.xent_regularize as the learning rate factor- the factor of - # 0.5 / args.xent_regularize is suitable as it means the xent - # final-layer learns at a rate independent of the regularization - # constant; and the 0.5 was tuned so as to make the relative progress - # similar in the xent and regular final layers. - output-layer name=output-xent input=fastlstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 - -EOF - steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ - -fi - - -if [ $stage -le 12 ]; then - if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then - utils/create_split_dir.pl \ - /export/b0{3,4,5,6}/$USER/kaldi-data/egs/wsj-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage - fi - steps/nnet3/chain/train.py --stage $train_stage \ - --cmd "$decode_cmd" \ - --feat.online-ivector-dir=$train_ivector_dir \ - --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ - --chain.xent-regularize $xent_regularize \ - --chain.leaky-hmm-coefficient 0.1 \ - --chain.l2-regularize 0.00005 \ - --chain.apply-deriv-weights false \ - --chain.lm-opts="--num-extra-lm-states=2000" \ - --trainer.num-chunk-per-minibatch 64,32 \ - --trainer.frames-per-iter 1500000 \ - --trainer.max-param-change $max_param_change \ - --trainer.num-epochs $num_epochs \ - --trainer.optimization.shrink-value 0.99 \ - --trainer.optimization.num-jobs-initial $num_jobs_initial \ - --trainer.optimization.num-jobs-final $num_jobs_final \ - --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ - --trainer.optimization.final-effective-lrate $final_effective_lrate \ - --trainer.optimization.momentum 0.0 \ - --trainer.deriv-truncate-margin 8 \ - --egs.stage $get_egs_stage \ - --egs.opts "--frames-overlap-per-eg 0" \ - --egs.chunk-width $frames_per_chunk \ - --egs.chunk-left-context $chunk_left_context \ - --egs.chunk-right-context $chunk_right_context \ - --egs.chunk-left-context-initial 0 \ - --egs.chunk-right-context-final 0 \ - --egs.dir "$common_egs_dir" \ - --cleanup.remove-egs $remove_egs \ - --feat-dir $train_data_dir \ - --tree-dir $tree_dir \ - --lat-dir $lat_dir \ - --dir $dir || exit 1; -fi - -if [ $stage -le 13 ]; then - # The reason we are using data/lang here, instead of $lang, is just to - # emphasize that it's not actually important to give mkgraph.sh the - # lang directory with the matched topology (since it gets the - # topology file from the model). 
So you could give it a different - # lang directory, one that contained a wordlist and LM of your choice, - # as long as phones.txt was compatible. - - utils/lang/check_phones_compatible.sh \ - data/lang_test_tgsmall/phones.txt $lang/phones.txt - utils/mkgraph.sh \ - --self-loop-scale 1.0 data/lang_test_tgsmall \ - $tree_dir $tree_dir/graph_tgsmall || exit 1; -fi - -if $test_online_decoding && [ $stage -le 14 ]; then - # note: if the features change (e.g. you add pitch features), you will have to - # change the options of the following command line. - steps/online/nnet3/prepare_online_decoding.sh \ - --mfcc-config conf/mfcc_hires.conf \ - $lang exp/nnet3/extractor ${dir} ${dir}_online - - rm $dir/.error 2>/dev/null || true - - for data in $test_sets; do - ( - data_affix=$(echo $data | sed s/test_//) - nspk=$(wc -l $dir/configs/network.xconfig input dim=100 name=ivector input dim=40 name=input @@ -150,31 +155,37 @@ if [ $stage -le 11 ]; then # please note that it is important to have input layer with the name=input # as the layer immediately preceding the fixed-affine-layer to enable # the use of short notation for the descriptor - fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat # the first splicing is moved before the lda layer, so no splicing here - relu-renorm-layer name=tdnn1 dim=512 - relu-renorm-layer name=tdnn2 dim=512 input=Append(-1,0,1) - relu-renorm-layer name=tdnn3 dim=512 input=Append(-1,0,1) - relu-renorm-layer name=tdnn4 dim=512 input=Append(-3,0,3) - relu-renorm-layer name=tdnn5 dim=512 input=Append(-3,0,3) - relu-renorm-layer name=tdnn6 dim=512 input=Append(-6,-3,0) - - ## adding the layers for chain branch - relu-renorm-layer name=prefinal-chain dim=512 target-rms=0.5 - output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 - 
- # adding the layers for xent branch - # This block prints the configs for a separate output that will be - # trained with a cross-entropy objective in the 'chain' models... this - # has the effect of regularizing the hidden parts of the model. we use - # 0.5 / args.xent_regularize as the learning rate factor- the factor of - # 0.5 / args.xent_regularize is suitable as it means the xent - # final-layer learns at a rate independent of the regularization - # constant; and the 0.5 was tuned so as to make the relative progress - # similar in the xent and regular final layers. - relu-renorm-layer name=prefinal-xent input=tdnn6 dim=512 target-rms=0.5 - output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + relu-batchnorm-layer name=tdnn1 $opts dim=1280 + linear-component name=tdnn2l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-layer name=tdnn2 $opts input=Append(0,1) dim=1280 + linear-component name=tdnn3l dim=256 $linear_opts + relu-batchnorm-layer name=tdnn3 $opts dim=1280 + linear-component name=tdnn4l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-layer name=tdnn4 $opts input=Append(0,1) dim=1280 + linear-component name=tdnn5l dim=256 $linear_opts + relu-batchnorm-layer name=tdnn5 $opts dim=1280 input=Append(tdnn5l, tdnn3l) + linear-component name=tdnn6l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn6 $opts input=Append(0,3) dim=1280 + linear-component name=tdnn7l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn7 $opts input=Append(0,3,tdnn6l,tdnn4l,tdnn2l) dim=1280 + linear-component name=tdnn8l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn8 $opts input=Append(0,3) dim=1280 + linear-component name=tdnn9l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn9 $opts input=Append(0,3,tdnn8l,tdnn6l,tdnn4l) dim=1280 + linear-component name=tdnn10l dim=256 $linear_opts input=Append(-3,0) + 
relu-batchnorm-layer name=tdnn10 $opts input=Append(0,3) dim=1280 + linear-component name=tdnn11l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn11 $opts input=Append(0,3,tdnn10l,tdnn8l,tdnn6l) dim=1280 + linear-component name=prefinal-l dim=256 $linear_opts + + relu-batchnorm-layer name=prefinal-chain input=prefinal-l $opts dim=1280 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + relu-batchnorm-layer name=prefinal-xent input=prefinal-l $opts dim=1280 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts EOF steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ fi @@ -192,10 +203,9 @@ if [ $stage -le 12 ]; then --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ --chain.xent-regularize $xent_regularize \ --chain.leaky-hmm-coefficient 0.1 \ - --chain.l2-regularize 0.00005 \ + --chain.l2-regularize 0.0 \ --chain.apply-deriv-weights false \ --chain.lm-opts="--num-extra-lm-states=2000" \ - --trainer.srand $srand \ --trainer.max-param-change $max_param_change \ --trainer.num-epochs $num_epochs \ --trainer.frames-per-iter 1500000 \ @@ -204,9 +214,8 @@ if [ $stage -le 12 ]; then --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ --trainer.optimization.final-effective-lrate $final_effective_lrate \ --trainer.num-chunk-per-minibatch $minibatch_size \ + --egs.stage $get_egs_stage \ --egs.chunk-width $chunk_width \ - --egs.chunk-left-context $chunk_left_context \ - --egs.chunk-right-context $chunk_right_context \ --egs.dir "$common_egs_dir" \ --egs.opts "--frames-overlap-per-eg 0" \ --cleanup.remove-egs $remove_egs \ diff --git a/egs/zeroth_korean/s5/local/chain/run_tdnn_opgru_1c.sh b/egs/zeroth_korean/s5/local/chain/tuning/run_tdnn_opgru_1a.sh similarity index 98% rename from egs/zeroth_korean/s5/local/chain/run_tdnn_opgru_1c.sh rename to 
egs/zeroth_korean/s5/local/chain/tuning/run_tdnn_opgru_1a.sh index 6b0817c3b37..e0404cd3d7c 100755 --- a/egs/zeroth_korean/s5/local/chain/run_tdnn_opgru_1c.sh +++ b/egs/zeroth_korean/s5/local/chain/tuning/run_tdnn_opgru_1a.sh @@ -8,13 +8,13 @@ stage=0 nj=30 train_set=train_clean speed_perturb=true -test_sets="test_200" -gmm=tri4b # this is the source gmm-dir that we'll use for alignments; it +test_sets="test_clean" +gmm=tri4 # this is the source gmm-dir that we'll use for alignments; it # should have alignments for the specified training data. nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. # Options which are not passed through to run_ivector_common.sh -affix=1c #affix for TDNN directory e.g. "1a" or "1b", in case we change the configuration. +affix=1a #affix for TDNN directory e.g. "1a" or "1b", in case we change the configuration. common_egs_dir= # OPGRU/chain options diff --git a/egs/zeroth_korean/s5/local/data_prep.sh b/egs/zeroth_korean/s5/local/data_prep.sh index 723028afb35..5e6a7d02ce6 100755 --- a/egs/zeroth_korean/s5/local/data_prep.sh +++ b/egs/zeroth_korean/s5/local/data_prep.sh @@ -1,14 +1,14 @@ #!/bin/bash -# Copyright 2014 Vassil Panayotov -# 2014 Johns Hopkins University (author: Daniel Povey) +# Copyright 2018 Atlas Guide (Author : Lucas Jo) +# 2018 Gridspace Inc. (Author: Wonkyum Lee) # Apache 2.0 # Modified by Lucas Jo 2017 (Altas Guide) if [ "$#" -ne 2 ]; then echo "Usage: $0 " - echo "e.g.: $0 /export/a15/vpanayotov/data/LibriSpeech/dev-clean data/dev-clean" + echo "e.g.: $0 ./db/train_data_01 data/train_data_01" exit 1 fi @@ -63,9 +63,7 @@ for scriptid_dir in $(find -L $src -mindepth 1 -maxdepth 1 -type d | sort); do [ ! -f $reader_trans ] && echo "$0: expected file $reader_trans to exist" && exit 1 cat $reader_trans >>$trans - # NOTE: For now we are using per-chapter utt2spk. That is each chapter is considered - # to be a different speaker. This is done for simplicity and because we want - # e.g. 
the CMVN to be calculated per-chapter + # NOTE: Each chapter is dedicated to each speaker. awk -v "reader=$reader" -v "scriptid=$scriptid" '{printf "%s %s_%s\n", $1, reader, scriptid}' \ <$reader_trans >>$utt2spk || exit 1 diff --git a/egs/zeroth_korean/s5/local/download_and_untar.sh b/egs/zeroth_korean/s5/local/download_and_untar.sh index 0b56bcb37b3..2e62a3273d4 100755 --- a/egs/zeroth_korean/s5/local/download_and_untar.sh +++ b/egs/zeroth_korean/s5/local/download_and_untar.sh @@ -26,7 +26,7 @@ if [ ! -f $dir/db.tar.gz ]; then if [ ! -d $dir ]; then mkdir -p $dir fi - wget -O $dir/db.tar.gz https://storage.googleapis.com/zeroth_project/zeroth_korean.tar.gz + wget -O $dir/db.tar.gz http://www.openslr.org/resources/40/zeroth_korean.tar.gz else echo " $dir/db.tar.gz already exist" fi diff --git a/egs/zeroth_korean/s5/local/multi_condition/copy_ali_dir.sh b/egs/zeroth_korean/s5/local/multi_condition/copy_ali_dir.sh deleted file mode 100755 index 42ea2dc4b9d..00000000000 --- a/egs/zeroth_korean/s5/local/multi_condition/copy_ali_dir.sh +++ /dev/null @@ -1,78 +0,0 @@ -#!/bin/bash - -# Copyright 2014 Johns Hopkins University (author: Vijayaditya Peddinti) -# Apache 2.0 - -# This script operates on a directory, such as in exp/tri4a_ali, -# that contains some subset of the following files: -# ali.*.gz -# tree -# cmvn_opts -# splice_opts -# num_jobs -# final.mdl -# It copies to another directory, possibly adding a specified prefix or a suffix -# to the utterance names. - - -# begin configuration section -utt_prefix= -utt_suffix= -cmd=run.pl -# end configuration section - -. utils/parse_options.sh - -if [ $# != 2 ]; then - echo "Usage: " - echo " $0 [options] " - echo "e.g.:" - echo " $0 --utt-prefix=1- exp/tri4a_ali exp/tri4a_rev1_ali" - echo "Options" - echo " --utt-prefix= # Prefix for utterance ids, default empty" - echo " --utt-suffix= # Suffix for utterance ids, default empty" - exit 1; -fi - - -export LC_ALL=C - -src_dir=$1 -dest_dir=$2 - -mkdir -p $dest_dir - -if [ ! 
-f $src_dir/ali.1.gz ]; then - echo "copy_ali_dir.sh: no such files $src_dir/ali.*.gz" - exit 1; -fi - -for f in tree cmvn_opts splice_opts num_jobs final.mdl; do - if [ ! -f $src_dir/$f ]; then - echo "copy_ali_dir.sh: no such file $src_dir/$f this might be serious error." - continue - fi - cp $src_dir/$f $dest_dir/ -done - -nj=$(cat $dest_dir/num_jobs) -mkdir -p $dest_dir/temp -cat << EOF > $dest_dir/temp/copy_ali.sh -set -e; -id=\$1 -echo "$src_dir/ali.\$id.gz" -gunzip -c $src_dir/ali.\$id.gz | \ - copy-int-vector ark:- ark,t:- | \ -python -c " -import sys -for line in sys.stdin: - parts = line.split() - print '$utt_prefix{0}$utt_suffix {1}'.format(parts[0], ' '.join(parts[1:])) -" | \ - gzip -c >$dest_dir/ali.\$id.gz || exit 1; -set +o pipefail; # unset the pipefail option. -EOF -chmod +x $dest_dir/temp/copy_ali.sh -$cmd -v PATH JOB=1:$nj $dest_dir/temp/copy_ali.JOB.log $dest_dir/temp/copy_ali.sh JOB || exit 1; - -echo "$0: copied alignments from $src_dir to $dest_dir" diff --git a/egs/zeroth_korean/s5/local/nnet2/run_5a_recData01.sh b/egs/zeroth_korean/s5/local/nnet2/run_5a_recData01.sh deleted file mode 100755 index c7e563906c6..00000000000 --- a/egs/zeroth_korean/s5/local/nnet2/run_5a_recData01.sh +++ /dev/null @@ -1,76 +0,0 @@ -#!/bin/bash - -# This is p-norm neural net training, with the "fast" script, on top of adapted -# 40-dimensional features. - -# Modified by Lucas Jo 2017 (Altas Guide) - - -train_stage=-10 -use_gpu=true - -. cmd.sh -. ./path.sh -. utils/parse_options.sh - - -if $use_gpu; then - if ! 
cuda-compiled; then - cat < $from/reco2dur - - if [ -f $to/utt2dur ] ; then - rm $to/uttdur - fi - for i in `seq 0 ${num_data_reps}`; do - cat $from/reco2dur | sed -e "s/^/rev${i}_/" >> $to/utt2dur - done - ### - - - for datadir in ${trainset}_rvb${num_data_reps} ; do - steps/make_mfcc.sh --nj $nj --mfcc-config conf/mfcc_hires.conf \ - --cmd "$train_cmd" data/${datadir}_hires exp/make_hires/$datadir $mfccdir || exit 1; - steps/compute_cmvn_stats.sh data/${datadir}_hires exp/make_hires/$datadir $mfccdir || exit 1; - done - - # copy the alignments for the newly created utterance ids - ali_dirs= - for i in `seq 0 $num_data_reps`; do - local/multi_condition/copy_ali_dir.sh --cmd "$decode_cmd" --utt-prefix "rev${i}_" ${gmmdir}_ali_${trainset} ${gmmdir}_ali_${trainset}_temp_$i || exit 1; - ali_dirs+=" ${gmmdir}_ali_${trainset}_temp_$i" - done - steps/combine_ali_dirs.sh data/${trainset}_rvb${num_data_reps} ${gmmdir}_ali_${trainset}_rvb $ali_dirs || exit 1; - - # We need to build a small system just because we need the LDA+MLLT transform - # to train the diag-UBM on top of. We align a subset of training data for - # this purpose. - utils/subset_data_dir.sh data/${trainset}_rvb${num_data_reps}_hires 100000 data/train_100k_hires - utils/subset_data_dir.sh data/${trainset}_rvb${num_data_reps}_hires 30000 data/train_30k_hires -fi - - -if [ $stage -le 4 ]; then - # Train a small system just for its LDA+MLLT transform. We use --num-iters 13 - # because after we get the transform (12th iter is the last), any further - # training is pointless. 
- - mkdir exp -p exp/nnet3${rvb_affix} - - steps/train_lda_mllt.sh --cmd "$train_cmd" --num-iters 13 \ - --realign-iters "" \ - --splice-opts "--left-context=3 --right-context=3" \ - 3000 10000 data/train_100k_hires data/lang_nosp \ - ${gmmdir}_ali_${trainset}_rvb exp/nnet3${rvb_affix}/tri2b -fi - - -if [ $stage -le 5 ]; then - # To train a diagonal UBM we don't need very much data, so use a small subset - # (actually, it's not that small: still around 100 hours). - steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj $nj --num_threads $maxThread --num-frames 700000 \ - data/train_30k_hires 512 exp/nnet3${rvb_affix}/tri2b exp/nnet3${rvb_affix}/diag_ubm -fi - -if [ $stage -le 6 ]; then - # iVector extractors can in general be sensitive to the amount of data, but - # this one has a fairly small dim (defaults to 100) so we don't use all of it, - # we use just the 3k subset (about one fifth of the data, or 200 hours). - steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \ - data/${trainset}_rvb${num_data_reps}_hires exp/nnet3${rvb_affix}/diag_ubm exp/nnet3${rvb_affix}/extractor || exit 1; -fi - -if [ $stage -le 7 ]; then - ivectordir=exp/nnet3${rvb_affix}/ivectors_${trainset}_rvb${num_data_reps}_hires - - # We extract iVectors on all the train data, which will be what we train the - # system on. With --utts-per-spk-max 2, the script. pairs the utterances - # into twos, and treats each of these pairs as one speaker. Note that these - # are extracted 'online'. - - # having a larger number of speakers is helpful for generalization, and to - # handle per-utterance decoding well (iVector starts at zero). 
- utils/data/modify_speaker_info.sh --utts-per-spk-max 2 \ - data/${trainset}_rvb${num_data_reps}_hires data/${trainset}_rvb${num_data_reps}_hires_max2 - - steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj $nj \ - data/${trainset}_rvb${num_data_reps}_hires_max2 exp/nnet3${rvb_affix}/extractor $ivectordir || exit 1; -fi - - -exit 0; diff --git a/egs/zeroth_korean/s5/local/nnet3/run_ivector_common.sh b/egs/zeroth_korean/s5/local/nnet3/run_ivector_common.sh index 116070ab50b..b3b60629a8c 100755 --- a/egs/zeroth_korean/s5/local/nnet3/run_ivector_common.sh +++ b/egs/zeroth_korean/s5/local/nnet3/run_ivector_common.sh @@ -5,7 +5,7 @@ stage=0 -gmmdir=exp/tri4b +gmmdir=exp/tri4 speed_perturb=false trainset=train_clean @@ -55,9 +55,6 @@ if [ $stage -le 3 ]; then # have multiple copies of Kaldi checked out and run the same recipe, not to let # them overwrite each other. mfccdir=mfcc_hires - #if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then - # utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/librispeech-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage - #fi for datadir in ${trainset} ; do utils/copy_data_dir.sh data/$datadir data/${datadir}_hires @@ -84,7 +81,7 @@ if [ $stage -le 4 ]; then --realign-iters "" \ --splice-opts "--left-context=3 --right-context=3" \ 3000 10000 data/${trainset}_hires data/lang_nosp \ - ${gmmdir}_ali_${trainset} exp/nnet3/tri2b + ${gmmdir}_ali_${trainset} exp/nnet3/tri2 fi @@ -92,13 +89,10 @@ if [ $stage -le 5 ]; then # To train a diagonal UBM we don't need very much data, so use a small subset # (actually, it's not that small: still around 100 hours). 
steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 --num-frames 700000 \ - data/train_30k_hires 512 exp/nnet3/tri2b exp/nnet3/diag_ubm + data/train_30k_hires 512 exp/nnet3/tri2 exp/nnet3/diag_ubm fi if [ $stage -le 6 ]; then - # iVector extractors can in general be sensitive to the amount of data, but - # this one has a fairly small dim (defaults to 100) so we don't use all of it, - # we use just the 3k subset (about one fifth of the data, or 200 hours). steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \ data/${trainset}_hires exp/nnet3/diag_ubm exp/nnet3/extractor || exit 1; fi diff --git a/egs/zeroth_korean/s5/local/online/export_online_nnet2_model.sh b/egs/zeroth_korean/s5/local/online/export_online_nnet2_model.sh deleted file mode 100755 index a9b4a61c6d2..00000000000 --- a/egs/zeroth_korean/s5/local/online/export_online_nnet2_model.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/bin/bash - -# Copyright 2017 Lucas Jo (Atlas Guide) -# Apache 2.0 - -if [ $# -ne "1" ]; then - echo "Usage: $0 " - echo "e.g.: $0 ./export" - exit 1 -fi - -tardir=$1 -srcdir=exp/nnet2_online/nnet_ms_a_online -graphdir=exp/tri5b/graph_tgsmall -oldlang=data/lang_test_tgsmall -newlang=data/lang_test_fglarge -oldlm=$oldlang/G.fst -newlm=$newlang/G.carpa -symtab=$newlang/words.txt - -for f in $srcdir/final.mdl $symtab $graphdir/HCLG.fst $srcdir/conf/mfcc.conf \ - $srcdir/conf/ivector_extractor.conf $oldlm $newlm; do - [ ! 
-f $f ] && echo "export_model.sh: no such file $f" && exit 1; -done - -mkdir -p $tardir/conf -cp -rpf $srcdir/final.mdl $tardir/final.mdl # acoustic model -cp -rpf $symtab $tardir/words.txt # word symbol table -cp -rpf $graphdir/HCLG.fst $tardir/HCLG.fst # HCLG -cp -rpf $srcdir/conf/mfcc.conf $tardir/conf/mfcc.conf -cp -rpf $srcdir/conf/ivector_extractor.conf $tardir/conf/ivector_extractor.conf -cp -rpf $oldlm $tardir/G.fst -cp -rpf $newlm $tardir/G.carpa diff --git a/egs/zeroth_korean/s5/local/online/run_nnet2_common.sh b/egs/zeroth_korean/s5/local/online/run_nnet2_common.sh deleted file mode 100755 index d1ac0a2f5d2..00000000000 --- a/egs/zeroth_korean/s5/local/online/run_nnet2_common.sh +++ /dev/null @@ -1,101 +0,0 @@ -#!/bin/bash - -# this script contains some common (shared) parts of the run_nnet*.sh scripts. -# Modified by Lucas Jo 2017 (Altas Guide) -. cmd.sh - - -stage=0 - -set -e -. cmd.sh -. ./path.sh -. ./utils/parse_options.sh - - -if [ $stage -le 1 ]; then - # Create high-resolution MFCC features (with 40 cepstra instead of 13). - # this shows how you can split across multiple file-systems. we'll split the - # MFCC dir across multiple locations. You might want to be careful here, if you - # have multiple copies of Kaldi checked out and run the same recipe, not to let - # them overwrite each other. - mfccdir=mfcc - #if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then - # utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/librispeech-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage - #fi - - for datadir in train_2x; do - utils/copy_data_dir.sh data/$datadir data/${datadir}_hires - steps/make_mfcc.sh --nj 40 --mfcc-config conf/mfcc_hires.conf \ - --cmd "$train_cmd" data/${datadir}_hires exp/make_hires/$datadir $mfccdir || exit 1; - steps/compute_cmvn_stats.sh data/${datadir}_hires exp/make_hires/$datadir $mfccdir || exit 1; - done - - # now create some data subsets. - # mixed is the clean+other data. 
- # a is 1/5 of the data, b is 2/5th of it. - utils/subset_data_dir.sh data/train_2x_hires 3000 data/train_mixed_hires_a - utils/subset_data_dir.sh data/train_2x_hires 6000 data/train_mixed_hires_b -fi - -if [ $stage -le 2 ]; then - # We need to build a small system just because we need the LDA+MLLT transform - # to train the diag-UBM on top of. We align a subset of training data for - # this purpose. - utils/subset_data_dir.sh --utt-list <(awk '{print $1}' data/train_mixed_hires_a/utt2spk) \ - data/train_2x data/train_2x_a - - steps/align_fmllr.sh --nj 40 --cmd "$train_cmd" \ - data/train_2x_a data/lang exp/tri5b exp/nnet2_online/tri5b_ali_a -fi - -if [ $stage -le 3 ]; then - # Train a small system just for its LDA+MLLT transform. We use --num-iters 13 - # because after we get the transform (12th iter is the last), any further - # training is pointless. - #5000 10000 data/train_mixed_hires_a data/lang \ - steps/train_lda_mllt.sh --cmd "$train_cmd" --num-iters 13 \ - --realign-iters "" \ - --splice-opts "--left-context=3 --right-context=3" \ - 3000 20000 data/train_mixed_hires_a data/lang \ - exp/nnet2_online/tri5b_ali_a exp/nnet2_online/tri6b -fi - - -if [ $stage -le 4 ]; then - mkdir -p exp/nnet2_online - # To train a diagonal UBM we don't need very much data, so use a small subset - # (actually, it's not that small: still around 100 hours). - steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 --num-frames 400000 \ - data/train_mixed_hires_a 256 exp/nnet2_online/tri6b exp/nnet2_online/diag_ubm -fi - -if [ $stage -le 5 ]; then - # iVector extractors can in general be sensitive to the amount of data, but - # this one has a fairly small dim (defaults to 100) so we don't use all of it, - # we use just the 3k subset (about one fifth of the data, or 200 hours). 
- steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \ - data/train_mixed_hires_b exp/nnet2_online/diag_ubm exp/nnet2_online/extractor || exit 1; -fi - -if [ $stage -le 6 ]; then - ivectordir=exp/nnet2_online/ivectors_train_2x_hires - #if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $ivectordir/storage ]; then - # utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/librispeech-$(date +'%m_%d_%H_%M')/s5/$ivectordir/storage $ivectordir/storage - #fi - - # We extract iVectors on all the train data, which will be what we train the - # system on. With --utts-per-spk-max 2, the script. pairs the utterances - # into twos, and treats each of these pairs as one speaker. Note that these - # are extracted 'online'. - - # having a larger number of speakers is helpful for generalization, and to - # handle per-utterance decoding well (iVector starts at zero). - steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/train_2x_hires data/train_2x_hires_max2 - - steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 60 \ - data/train_2x_hires_max2 exp/nnet2_online/extractor $ivectordir || exit 1; -fi - - -exit 0; diff --git a/egs/zeroth_korean/s5/local/online/run_nnet2_ms.sh b/egs/zeroth_korean/s5/local/online/run_nnet2_ms.sh deleted file mode 100755 index d46e2f63667..00000000000 --- a/egs/zeroth_korean/s5/local/online/run_nnet2_ms.sh +++ /dev/null @@ -1,267 +0,0 @@ -#!/bin/bash - -# This is the "multi-splice" version of the online-nnet2 training script. -# It's currently the best recipe. -# You'll notice that we splice over successively larger windows as we go deeper -# into the network. - -# Modified by Lucas Jo 2017 (Altas Guide) - -. cmd.sh - - -stage=0 -train_stage=-10 -use_gpu=true -dir=exp/nnet2_online/nnet_ms_a -exit_train_stage=-100 - -set -e -. cmd.sh -. ./path.sh -. ./utils/parse_options.sh - - -if $use_gpu; then - if ! 
cuda-compiled; then - cat < $trans rm -f $trans"_tmp_index" -#transcripList=$(find $dataDir -name "*.norm.txt" -type f | sort) -#for transcript in $transcripList; -#do -# echo "read: " $transcript -# cat $transcript | awk '{print $1;}' > tmp -# cat $transcript | awk '{$1="";print $0;}' | \ -# local/strip.py | \ -# #morfessor -l $lmDir/data/_lexicon_/mergedCorpus.model4.reduced -T - -o tmp2 --output-format '{analysis} ' --output-newlines -# morfessor -l $lmDir/zeroth_morfessor.seg -T - -o tmp2 --output-format '{analysis} ' --output-newlines -# #$lmDir/data/_lm_/seg2sentence.py tmp2 > tmp3 -# -# array=(${transcript//\./ }) -# echo "write: " ${array[0]}.${array[1]}.txt -# paste -d" " tmp tmp2 > ${array[0]}.${array[1]}.txt -#done -#rm -f tmp* diff --git a/egs/zeroth_korean/s5/run.sh b/egs/zeroth_korean/s5/run.sh index 48a3834050c..033366f81b2 100755 --- a/egs/zeroth_korean/s5/run.sh +++ b/egs/zeroth_korean/s5/run.sh @@ -1,9 +1,8 @@ #!/bin/bash # -# Based mostly on the WSJ/Librispeech recipe. The training database is #####, -# it consists of 51hrs korean speech with cleaned automatic transcripts: -# -# http://www.openslr.org/resources (Mirror). +# Based mostly on the WSJ/Librispeech recipe. +# The training/testing database is described in http://www.openslr.org/40/ +# This corpus consists of 51hrs korean speech with cleaned automatic transcripts: # # Copyright 2018 Atlas Guide (Author : Lucas Jo) # 2018 Gridspace Inc. (Author: Wonkyum Lee) @@ -12,13 +11,17 @@ # # Check list before start -# 1. locale setup -# 2. pre-installed package: awscli, Morfessor-2.0.1, flac, sox, same cuda library, unzip -# 3. pre-install or symbolic link for easy going: rirs_noises.zip (takes pretty long time) -# 4. parameters: nCPU, num_jobs_initial, num_jobs_final, --max-noises-per-minute +# 1. locale setup (see egs/zeroth_korean/s5/path.sh; you need this "export LC_ALL=ko_KR.UTF-8" ) +# 2. 
required software: Morfessor-2.0.1 (see tools/extras/install_morfessor.sh) +stage=0 db_dir=./db -nCPU=16 +nj=16 + +chain_train=true +decode=true # set false if you don't want to decode each GMM model +decode_rescoring=true # set false if you don't want to rescore with large language model +test_set="test_clean" . ./cmd.sh . ./path.sh @@ -26,169 +29,237 @@ nCPU=16 # you might not want to do this for interactive shells. set -e -startTime=$(date +'%F-%H-%M') -echo "started at" $startTime +if [ $stage -le 0 ]; then + # download the data. + local/download_and_untar.sh $db_dir +fi -# download the data. -local/download_and_untar.sh $db_dir +if [ $stage -le 1 ]; then + # format the data as Kaldi data directories + for part in train_data_01 test_data_01; do + # use underscore-separated names in data directories. + local/data_prep.sh $db_dir/$part data/$part + done +fi -# format the data as Kaldi data directories -for part in train_data_01 test_data_01; do - # use underscore-separated names in data directories. 
- local/data_prep.sh $db_dir/$part data/$(echo $part | sed s/-/_/g) -done +if [ $stage -le 2 ]; then + # update segmentation of transcripts + for part in train_data_01 test_data_01; do + local/updateSegmentation.sh data/$part data/local/lm + done +fi -# update segmentation of transcripts -for part in train_data_01 test_data_01; do - local/updateSegmentation.sh data/$part data/local/lm -done +if [ $stage -le 3 ]; then + # prepare dictionary and language model + local/prepare_dict.sh data/local/lm data/local/dict_nosp + + utils/prepare_lang.sh data/local/dict_nosp \ + "" data/local/lang_tmp_nosp data/lang_nosp +fi -# prepare dictionary and language model -local/prepare_dict.sh data/local/lm data/local/dict_nosp +if [ $stage -le 4 ]; then + # build testing language model + local/format_lms.sh --src-dir data/lang_nosp data/local/lm + + # re-scoring language model + if $decode_rescoring ; then + utils/build_const_arpa_lm.sh data/local/lm/zeroth.lm.tg.arpa.gz \ + data/lang_nosp data/lang_nosp_test_tglarge + utils/build_const_arpa_lm.sh data/local/lm/zeroth.lm.fg.arpa.gz \ + data/lang_nosp data/lang_nosp_test_fglarge + fi +fi -utils/prepare_lang.sh data/local/dict_nosp \ - "" data/local/lang_tmp_nosp data/lang_nosp -local/format_lms.sh --src-dir data/lang_nosp data/local/lm +if [ $stage -le 5 ]; then + # Feature extraction (MFCC) + mfccdir=mfcc + for part in train_data_01 test_data_01; do + steps/make_mfcc.sh --cmd "$train_cmd" --nj $nj data/$part exp/make_mfcc/$part $mfccdir + steps/compute_cmvn_stats.sh data/$part exp/make_mfcc/$part $mfccdir + done + + # ... and then combine data sets into one (for later extension) + utils/combine_data.sh \ + data/train_clean data/train_data_01 + + utils/combine_data.sh \ + data/test_clean data/test_data_01 + + # Make some small data subsets for early system-build stages. 
+ utils/subset_data_dir.sh --shortest data/train_clean 2000 data/train_2kshort + utils/subset_data_dir.sh data/train_clean 5000 data/train_5k + utils/subset_data_dir.sh data/train_clean 10000 data/train_10k +fi -# Create ConstArpaLm format language model for full 3-gram and 4-gram LMs -# it takes long time and do this again after computing silence prob. -# you can do comment out here this time +if [ $stage -le 5 ]; then + echo "#### Monophone Training ###########" + # train a monophone system & align + steps/train_mono.sh --boost-silence 1.25 --nj $nj --cmd "$train_cmd" \ + data/train_2kshort data/lang_nosp exp/mono + if $decode; then + utils/mkgraph.sh data/lang_nosp_test_tgsmall exp/mono exp/mono/graph_nosp_tgsmall + nspk=$(wc -l " data/local/lang_tmp data/lang - -local/format_lms.sh --src-dir data/lang data/local/lm - -utils/build_const_arpa_lm.sh \ - data/local/lm/zeroth.lm.tg.arpa.gz data/lang data/lang_test_tglarge -utils/build_const_arpa_lm.sh \ - data/local/lm/zeroth.lm.fg.arpa.gz data/lang data/lang_test_fglarge - -# align the entire train_clean using the tri3b model -steps/align_fmllr.sh --nj $nCPU --cmd "$train_cmd" \ - data/train_clean data/lang exp/tri3b exp/tri3b_ali_train_clean - -echo "#### SAT again on train_clean ###########" -# train another LDA+MLLT+SAT system on the entire subset -steps/train_sat.sh --cmd "$train_cmd" 4200 40000 \ - data/train_clean data/lang exp/tri3b_ali_train_clean exp/tri4b - -# decode using the tri4b model with pronunciation and silence probabilities -utils/mkgraph.sh \ - data/lang_test_tgsmall exp/tri4b exp/tri4b/graph_tgsmall - -# the size is properly set? 
-utils/subset_data_dir.sh data/test_clean 200 data/test_200 - -for test in test_200; do - nspk=$(wc -l " data/local/lang_tmp data/lang + + local/format_lms.sh --src-dir data/lang data/local/lm + + utils/build_const_arpa_lm.sh \ + data/local/lm/zeroth.lm.tg.arpa.gz data/lang data/lang_test_tglarge + utils/build_const_arpa_lm.sh \ + data/local/lm/zeroth.lm.fg.arpa.gz data/lang data/lang_test_fglarge + + if $decode; then + utils/mkgraph.sh data/lang_test_tgsmall exp/tri3 exp/tri3/graph_tgsmall + nspk=$(wc -l Date: Mon, 9 Jul 2018 20:07:09 -0700 Subject: [PATCH 04/26] cmd.sh cleaninig --- egs/zeroth_korean/s5/cmd.sh | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/egs/zeroth_korean/s5/cmd.sh b/egs/zeroth_korean/s5/cmd.sh index 1687940f7d1..34031439792 100644 --- a/egs/zeroth_korean/s5/cmd.sh +++ b/egs/zeroth_korean/s5/cmd.sh @@ -10,16 +10,8 @@ # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. -export train_cmd="run.pl --mem 2G" -export decode_cmd="run.pl --mem 4G" -export mkgraph_cmd="run.pl --mem 8G" -export normalize_cmd="run.pl --mem 4G" +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" +export normalize_cmd="queue.pl --mem 4G" -hostInAtlas="ares hephaestus jupiter neptune" -if [[ ! 
-z $(echo $hostInAtlas | grep -o $(hostname -f)) ]]; then - queue_conf=conf/queue.conf - export train_cmd="queue.pl --config $queue_conf --mem 4G" - export decode_cmd="queue.pl --config $queue_conf --mem 8G" - export mkgraph_cmd="queue.pl --config $queue_conf --mem 16G" - export normalize_cmd="queue.pl --config $queue_conf --mem 4G" -fi From b00a81390368d9265de94b9bb1af2a105d99623d Mon Sep 17 00:00:00 2001 From: Wonkyum Lee Date: Tue, 10 Jul 2018 08:57:41 -0700 Subject: [PATCH 05/26] run.sh script fix --- egs/zeroth_korean/s5/local/format_lms.sh | 4 +++- egs/zeroth_korean/s5/run.sh | 25 ++++++++++++------------ 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/egs/zeroth_korean/s5/local/format_lms.sh b/egs/zeroth_korean/s5/local/format_lms.sh index 5947ae6b620..a9111e80eeb 100755 --- a/egs/zeroth_korean/s5/local/format_lms.sh +++ b/egs/zeroth_korean/s5/local/format_lms.sh @@ -45,7 +45,9 @@ trap "rm -r $tmpdir" EXIT mkdir -p $tmpdir -for lm_suffix in tgsmall tgmed; do +#lm_sets="tgsmall tgmed" +lm_sets="tgsmall" +for lm_suffix in ${lm_sets}; do # tglarge is prepared by a separate command, called from run.sh; we don't # want to compile G.fst for tglarge, as it takes a while. test=${src_dir}_test_${lm_suffix} diff --git a/egs/zeroth_korean/s5/run.sh b/egs/zeroth_korean/s5/run.sh index 033366f81b2..e410f9514d7 100755 --- a/egs/zeroth_korean/s5/run.sh +++ b/egs/zeroth_korean/s5/run.sh @@ -25,6 +25,7 @@ test_set="test_clean" . ./cmd.sh . ./path.sh +. utils/parse_options.sh # e.g. this parses the --stage option if supplied. # you might not want to do this for interactive shells. 
set -e @@ -92,7 +93,7 @@ if [ $stage -le 5 ]; then utils/subset_data_dir.sh data/train_clean 10000 data/train_10k fi -if [ $stage -le 5 ]; then +if [ $stage -le 6 ]; then echo "#### Monophone Training ###########" # train a monophone system & align steps/train_mono.sh --boost-silence 1.25 --nj $nj --cmd "$train_cmd" \ @@ -100,7 +101,7 @@ if [ $stage -le 5 ]; then if $decode; then utils/mkgraph.sh data/lang_nosp_test_tgsmall exp/mono exp/mono/graph_nosp_tgsmall nspk=$(wc -l Date: Wed, 11 Jul 2018 10:45:19 -0700 Subject: [PATCH 06/26] add RESULTS page with minor typo fix --- egs/zeroth_korean/s5/RESULTS | 63 +++++++++++++++++++ .../s5/local/chain/tuning/run_tdnn_1a.sh | 4 +- egs/zeroth_korean/s5/run.sh | 7 +-- 3 files changed, 69 insertions(+), 5 deletions(-) create mode 100644 egs/zeroth_korean/s5/RESULTS diff --git a/egs/zeroth_korean/s5/RESULTS b/egs/zeroth_korean/s5/RESULTS new file mode 100644 index 00000000000..d8503cfcac4 --- /dev/null +++ b/egs/zeroth_korean/s5/RESULTS @@ -0,0 +1,63 @@ +#!/bin/bash + +# this RESULTS file was obtained by Wonkyum Lee in July 2018. 
+ +for dir in exp/*; do + steps/info/gmm_dir_info.pl $dir + for x in $dir/decode*test*; do [ -d $x ] && [[ $x =~ "$1" ]] && grep WER $x/wer_* | utils/best_wer.sh; done +done +exit 0 + +# monophone, trained on the 2k shortest utterances +exp/mono: nj=16 align prob=-99.85 over 2.66h [retry=0.8%, fail=0.3%] states=130 gauss=1004 +%WER 70.24 [ 6499 / 9253, 295 ins, 1399 del, 4805 sub ] exp/mono/decode_nosp_fglarge_test_clean/wer_8_0.5 +%WER 71.28 [ 6596 / 9253, 185 ins, 1721 del, 4690 sub ] exp/mono/decode_nosp_tglarge_test_clean/wer_9_1.0 +%WER 78.83 [ 7294 / 9253, 218 ins, 1752 del, 5324 sub ] exp/mono/decode_nosp_tgsmall_test_clean/wer_10_0.0 + +# first triphone build, trained on 5k utterances +exp/tri1: nj=16 align prob=-98.34 over 11.55h [retry=1.6%, fail=0.6%] states=1568 gauss=10030 tree-impr=4.07 +%WER 37.44 [ 3464 / 9253, 258 ins, 725 del, 2481 sub ] exp/tri1/decode_nosp_fglarge_test_clean/wer_15_0.5 +%WER 38.85 [ 3595 / 9253, 347 ins, 633 del, 2615 sub ] exp/tri1/decode_nosp_tglarge_test_clean/wer_15_0.0 +%WER 53.23 [ 4925 / 9253, 296 ins, 1060 del, 3569 sub ] exp/tri1/decode_nosp_tgsmall_test_clean/wer_15_0.0 + +# tri2 is an LDA+MLLT systemm, trained on 10k utterances +exp/tri2: nj=16 align prob=-49.63 over 23.00h [retry=1.7%, fail=0.8%] states=2000 gauss=15039 tree-impr=4.70 lda-sum=18.11 mllt:impr,logdet=0.99,1.39 +%WER 33.50 [ 3100 / 9253, 248 ins, 626 del, 2226 sub ] exp/tri2/decode_nosp_fglarge_test_clean/wer_16_0.5 +%WER 34.55 [ 3197 / 9253, 315 ins, 537 del, 2345 sub ] exp/tri2/decode_nosp_tglarge_test_clean/wer_16_0.0 +%WER 48.98 [ 4532 / 9253, 303 ins, 903 del, 3326 sub ] exp/tri2/decode_nosp_tgsmall_test_clean/wer_14_0.0 + +# tri3 is an LDA+MLLT+SAT system, trained on entire clean training set +exp/tri3: nj=16 align prob=-48.95 over 51.22h [retry=1.6%, fail=0.7%] states=3336 gauss=40065 fmllr-impr=2.72 over 19.18h tree-impr=7.23 +%WER 23.89 [ 2211 / 9253, 233 ins, 404 del, 1574 sub ] exp/tri3/decode_nosp_fglarge_test_clean/wer_15_0.0 +%WER 24.47 [ 
2264 / 9253, 252 ins, 385 del, 1627 sub ] exp/tri3/decode_nosp_tglarge_test_clean/wer_13_0.0 +%WER 37.81 [ 3499 / 9253, 274 ins, 671 del, 2554 sub ] exp/tri3/decode_nosp_tgsmall_test_clean/wer_13_0.0 +%WER 49.00 [ 4534 / 9253, 302 ins, 874 del, 3358 sub ] exp/tri3/decode_nosp_tgsmall_test_clean.si/wer_14_0.0 +%WER 21.68 [ 2006 / 9253, 226 ins, 346 del, 1434 sub ] exp/tri3/decode_fglarge_test_clean/wer_15_0.0 +%WER 22.59 [ 2090 / 9253, 231 ins, 372 del, 1487 sub ] exp/tri3/decode_tglarge_test_clean/wer_15_0.0 +%WER 34.83 [ 3223 / 9253, 294 ins, 605 del, 2324 sub ] exp/tri3/decode_tgsmall_test_clean/wer_12_0.0 +%WER 45.28 [ 4190 / 9253, 270 ins, 880 del, 3040 sub ] exp/tri3/decode_tgsmall_test_clean.si/wer_15_0.0 + +# tri4 is an LDA+MLLT+SAT system after estimating pronunciation probabilities +# and word-and-pronunciation-dependent silence probabilities. +exp/tri4: nj=16 align prob=-48.70 over 51.22h [retry=1.5%, fail=0.7%] states=3368 gauss=40039 fmllr-impr=0.23 over 42.91h tree-impr=7.87 +%WER 21.61 [ 2000 / 9253, 210 ins, 379 del, 1411 sub ] exp/tri4/decode_fglarge_test_clean/wer_14_0.5 +%WER 22.59 [ 2090 / 9253, 237 ins, 371 del, 1482 sub ] exp/tri4/decode_tglarge_test_clean/wer_15_0.0 +%WER 34.57 [ 3199 / 9253, 285 ins, 595 del, 2319 sub ] exp/tri4/decode_tgsmall_test_clean/wer_12_0.0 +%WER 45.82 [ 4240 / 9253, 270 ins, 833 del, 3137 sub ] exp/tri4/decode_tgsmall_test_clean.si/wer_13_0.0 + +for dir in exp/chain/tdnn*_sp; do + steps/info/chain_dir_info.pl $dir + for x in ${dir}_online/decode*test*; do [ -d $x ] && [[ $x =~ "$1" ]] && grep WER $x/wer_* | utils/best_wer.sh; done +done +exit 0 + +# tdnn_1a is a kind of factorized TDNN, with skip connections. 
+exp/chain/tdnn_1a_sp: num-iters=72 nj=3..16 num-params=18.6M dim=40+100->3040 combine=-0.046->-0.045 (over 3) xent:train/valid[47,71,final]=(-0.898,-0.775,-0.766/-0.967,-0.855,-0.845) logprob:train/valid[47,71,final]=(-0.056,-0.043,-0.043/-0.069,-0.057,-0.057) +%WER 11.42 [ 1057 / 9253, 128 ins, 193 del, 736 sub ] exp/chain/tdnn_1a_sp_online/decode_fglarge_test_clean/wer_16_1.0 +%WER 19.25 [ 1781 / 9253, 188 ins, 291 del, 1302 sub ] exp/chain/tdnn_1a_sp_online/decode_tgsmall_test_clean/wer_11_0.5 + +# This chain system has TDNN+Norm-OPGRU architecture. +exp/chain/tdnn_opgru_1a_sp: num-iters=130 nj=2..12 num-params=37.9M dim=40+100->3000 combine=-0.040->-0.038 (over 6) xent:train/valid[85,129,final]=(-1.12,-0.608,-0.616/-1.21,-0.697,-0.705) logprob:train/valid[85,129,final]=(-0.062,-0.027,-0.027/-0.067,-0.030,-0.030) +%WER 9.33 [ 863 / 9253, 101 ins, 162 del, 600 sub ] exp/chain/tdnn_opgru_1a_sp_online/decode_fglarge_test_clean/wer_8_1.0 +%WER 15.13 [ 1400 / 9253, 154 ins, 217 del, 1029 sub ] exp/chain/tdnn_opgru_1a_sp_online/decode_tgsmall_test_clean/wer_9_0.0 + diff --git a/egs/zeroth_korean/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/zeroth_korean/s5/local/chain/tuning/run_tdnn_1a.sh index 36ef3f08aad..0a01d08b8f4 100755 --- a/egs/zeroth_korean/s5/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/zeroth_korean/s5/local/chain/tuning/run_tdnn_1a.sh @@ -58,13 +58,15 @@ fi local/nnet3/run_ivector_common.sh --stage $stage --speed-perturb ${speed_perturb} +suffix= if [ "$speed_perturb" == "true" ]; then train_set=${train_set}_sp + suffix=_sp fi gmm_dir=exp/${gmm} lat_dir=exp/chain/${gmm}_${train_set}_lats -dir=exp/chain/tdnn${affix} +dir=exp/chain/tdnn_${affix}${suffix} train_data_dir=data/${train_set}_hires train_ivector_dir=exp/nnet3/ivectors_${train_set}_hires lores_train_data_dir=data/${train_set} diff --git a/egs/zeroth_korean/s5/run.sh b/egs/zeroth_korean/s5/run.sh index e410f9514d7..32f99863cc5 100755 --- a/egs/zeroth_korean/s5/run.sh +++ b/egs/zeroth_korean/s5/run.sh 
@@ -95,7 +95,7 @@ fi if [ $stage -le 6 ]; then echo "#### Monophone Training ###########" - # train a monophone system & align + # train a monophone system with 2k short utts steps/train_mono.sh --boost-silence 1.25 --nj $nj --cmd "$train_cmd" \ data/train_2kshort data/lang_nosp exp/mono if $decode; then @@ -118,8 +118,7 @@ if [ $stage -le 7 ]; then echo "#### Triphone Training, delta + delta-delta ###########" steps/align_si.sh --boost-silence 1.25 --nj $nj --cmd "$train_cmd" \ data/train_5k data/lang_nosp exp/mono exp/mono_ali_5k - # train a first delta + delta-delta triphone system on a subset of 5000 utterancesa - # number of maximum pdf, gaussian (under/over fitting) + # train a first delta + delta-delta triphone system on a subset of 5000 utterances steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" \ 2000 10000 data/train_5k data/lang_nosp exp/mono_ali_5k exp/tri1 if $decode; then @@ -257,7 +256,7 @@ fi echo "GMM trainig is Done" if $chain_train; then - ## online chain recipe using only clean data set + ## Training Chain Acoustic model using clean data set echo "#### chain training ###########" local/chain/run_tdnn.sh fi From 5c22bab9483015b628219c6770c3cd3faa08ba68 Mon Sep 17 00:00:00 2001 From: Wonkyum Lee Date: Thu, 12 Jul 2018 09:33:32 -0700 Subject: [PATCH 07/26] run_tdnn_1a.sh fix --- .../s5/local/chain/tuning/run_tdnn_1a.sh | 125 +++++++++--------- 1 file changed, 61 insertions(+), 64 deletions(-) diff --git a/egs/zeroth_korean/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/zeroth_korean/s5/local/chain/tuning/run_tdnn_1a.sh index 0a01d08b8f4..381b13492d0 100755 --- a/egs/zeroth_korean/s5/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/zeroth_korean/s5/local/chain/tuning/run_tdnn_1a.sh @@ -20,22 +20,18 @@ common_egs_dir= # LSTM/chain options train_stage=-10 xent_regularize=0.1 -max_param_change=2.0 +dropout_schedule='0,0@0.20,0.5@0.50,0' # training chunk-options -get_egs_stage=-10 -chunk_width=150,110,100 +chunk_width=140,100,160 +# we don't need 
extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 # training options -num_jobs_initial=3 -num_jobs_final=16 -num_epochs=6 -minibatch_size=128 -initial_effective_lrate=0.001 -final_effective_lrate=0.0001 +srand=0 remove_egs=true - #decode options test_online_decoding=true # if true, it will run the last decoding stage. @@ -66,7 +62,7 @@ fi gmm_dir=exp/${gmm} lat_dir=exp/chain/${gmm}_${train_set}_lats -dir=exp/chain/tdnn_${affix}${suffix} +dir=exp/chain/tdnn${affix}${suffix} train_data_dir=data/${train_set}_hires train_ivector_dir=exp/nnet3/ivectors_${train_set}_hires lores_train_data_dir=data/${train_set} @@ -142,14 +138,16 @@ fi if [ $stage -le 11 ]; then mkdir -p $dir echo "$0: creating neural net configs using the xconfig parser"; + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) - opts="l2-regularize=0.002" - linear_opts="orthonormal-constraint=1.0" - output_opts="l2-regularize=0.0005 bottleneck-dim=256" + tdnn_opts="l2-regularize=0.01 dropout-proportion=0.0 dropout-per-dim-continuous=true" + tdnnf_opts="l2-regularize=0.01 dropout-proportion=0.0 bypass-scale=0.66" + linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0" + prefinal_opts="l2-regularize=0.01" + output_opts="l2-regularize=0.005" mkdir -p $dir/configs - cat < $dir/configs/network.xconfig input dim=100 name=ivector input dim=40 name=input @@ -160,34 +158,28 @@ if [ $stage -le 11 ]; then fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat # the first splicing is moved before the lda layer, so no splicing here - relu-batchnorm-layer name=tdnn1 $opts dim=1280 - linear-component name=tdnn2l dim=256 $linear_opts input=Append(-1,0) - relu-batchnorm-layer name=tdnn2 $opts input=Append(0,1) dim=1280 - linear-component name=tdnn3l dim=256 $linear_opts - relu-batchnorm-layer name=tdnn3 $opts dim=1280 - 
linear-component name=tdnn4l dim=256 $linear_opts input=Append(-1,0) - relu-batchnorm-layer name=tdnn4 $opts input=Append(0,1) dim=1280 - linear-component name=tdnn5l dim=256 $linear_opts - relu-batchnorm-layer name=tdnn5 $opts dim=1280 input=Append(tdnn5l, tdnn3l) - linear-component name=tdnn6l dim=256 $linear_opts input=Append(-3,0) - relu-batchnorm-layer name=tdnn6 $opts input=Append(0,3) dim=1280 - linear-component name=tdnn7l dim=256 $linear_opts input=Append(-3,0) - relu-batchnorm-layer name=tdnn7 $opts input=Append(0,3,tdnn6l,tdnn4l,tdnn2l) dim=1280 - linear-component name=tdnn8l dim=256 $linear_opts input=Append(-3,0) - relu-batchnorm-layer name=tdnn8 $opts input=Append(0,3) dim=1280 - linear-component name=tdnn9l dim=256 $linear_opts input=Append(-3,0) - relu-batchnorm-layer name=tdnn9 $opts input=Append(0,3,tdnn8l,tdnn6l,tdnn4l) dim=1280 - linear-component name=tdnn10l dim=256 $linear_opts input=Append(-3,0) - relu-batchnorm-layer name=tdnn10 $opts input=Append(0,3) dim=1280 - linear-component name=tdnn11l dim=256 $linear_opts input=Append(-3,0) - relu-batchnorm-layer name=tdnn11 $opts input=Append(0,3,tdnn10l,tdnn8l,tdnn6l) dim=1280 - linear-component name=prefinal-l dim=256 $linear_opts - - relu-batchnorm-layer name=prefinal-chain input=prefinal-l $opts dim=1280 + relu-batchnorm-dropout-layer name=tdnn1 $tdnn_opts dim=1024 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=0 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + 
tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + linear-component name=prefinal-l dim=192 $linear_opts + + + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1024 small-dim=192 output-layer name=output include-log-softmax=false dim=$num_targets $output_opts - relu-batchnorm-layer name=prefinal-xent input=prefinal-l $opts dim=1280 + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=1024 small-dim=192 output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts + EOF steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ fi @@ -199,33 +191,38 @@ if [ $stage -le 12 ]; then /export/b0{3,4,5,6}/$USER/kaldi-data/egs/wsj-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage fi - steps/nnet3/chain/train.py --stage $train_stage \ - --cmd "$decode_cmd" \ + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ --feat.online-ivector-dir=$train_ivector_dir \ - --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ --chain.xent-regularize $xent_regularize \ - --chain.leaky-hmm-coefficient 0.1 \ - --chain.l2-regularize 0.0 \ - --chain.apply-deriv-weights false \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=false \ --chain.lm-opts="--num-extra-lm-states=2000" \ - --trainer.max-param-change $max_param_change \ - --trainer.num-epochs $num_epochs \ - --trainer.frames-per-iter 1500000 \ - --trainer.optimization.num-jobs-initial $num_jobs_initial \ - --trainer.optimization.num-jobs-final $num_jobs_final \ - 
--trainer.optimization.initial-effective-lrate $initial_effective_lrate \ - --trainer.optimization.final-effective-lrate $final_effective_lrate \ - --trainer.num-chunk-per-minibatch $minibatch_size \ - --egs.stage $get_egs_stage \ - --egs.chunk-width $chunk_width \ - --egs.dir "$common_egs_dir" \ - --egs.opts "--frames-overlap-per-eg 0" \ - --cleanup.remove-egs $remove_egs \ - --use-gpu true \ - --feat-dir $train_data_dir \ - --tree-dir $tree_dir \ - --lat-dir $lat_dir \ - --dir $dir || exit 1; + --trainer.dropout-schedule $dropout_schedule \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=10 \ + --trainer.frames-per-iter=2000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=8 \ + --trainer.optimization.initial-effective-lrate=0.0005 \ + --trainer.optimization.final-effective-lrate=0.00005 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=0 \ + --egs.chunk-right-context=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; + fi if [ $stage -le 13 ]; then From 73b9bdbb0876454d7cedd1553db863d473b3c2d3 Mon Sep 17 00:00:00 2001 From: Wonkyum Lee Date: Fri, 13 Jul 2018 15:51:39 -0700 Subject: [PATCH 08/26] tdnn_opgru_1a change --- .../local/chain/tuning/run_tdnn_opgru_1a.sh | 41 ++++++++----------- 1 file changed, 17 insertions(+), 24 deletions(-) diff --git a/egs/zeroth_korean/s5/local/chain/tuning/run_tdnn_opgru_1a.sh b/egs/zeroth_korean/s5/local/chain/tuning/run_tdnn_opgru_1a.sh index e0404cd3d7c..4fd92f5b346 100755 --- a/egs/zeroth_korean/s5/local/chain/tuning/run_tdnn_opgru_1a.sh +++ b/egs/zeroth_korean/s5/local/chain/tuning/run_tdnn_opgru_1a.sh @@ -20,33 +20,25 @@ common_egs_dir= # OPGRU/chain options 
train_stage=-10 get_egs_stage=-10 + xent_regularize=0.1 -label_delay=5 -max_param_change=2.0 +dropout_schedule='0,0@0.20,0.2@0.50,0' -# training chunk-options -chunk_width=150 +chunk_width=140,100,160 chunk_left_context=40 chunk_right_context=0 -frames_per_chunk= +label_delay=5 -extra_left_context=50 -extra_right_context=0 - -# training options -srand=0 -num_jobs_initial=2 -num_jobs_final=12 -num_epochs=8 -initial_effective_lrate=0.001 -final_effective_lrate=0.0001 -dropout_schedule='0,0@0.20,0.2@0.50,0' remove_egs=true #decode options test_online_decoding=true # if true, it will run the last decoding stage. +# decode options +extra_left_context=50 +frames_per_chunk= + # End configuration section. echo "$0 $@" # Print the command line for logging @@ -74,7 +66,7 @@ fi gmm_dir=exp/${gmm} lat_dir=exp/chain/${gmm}_${train_set}_lats -dir=exp/chain/tdnn_opgru_${affix}${suffix} +dir=exp/chain/tdnn_opgru${affix}${suffix} train_data_dir=data/${train_set}_hires train_ivector_dir=exp/nnet3/ivectors_${train_set}_hires lores_train_data_dir=data/${train_set} @@ -225,14 +217,14 @@ if [ $stage -le 12 ]; then --egs.chunk-left-context-initial 0 \ --egs.chunk-right-context-final 0 \ --trainer.num-chunk-per-minibatch 64,32 \ - --trainer.frames-per-iter 1500000 \ - --trainer.num-epochs $num_epochs \ + --trainer.frames-per-iter 2000000 \ + --trainer.num-epochs=8 \ --trainer.optimization.shrink-value 0.99 \ - --trainer.optimization.num-jobs-initial $num_jobs_initial \ - --trainer.optimization.num-jobs-final $num_jobs_final \ - --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ - --trainer.optimization.final-effective-lrate $final_effective_lrate \ - --trainer.max-param-change $max_param_change \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ --trainer.deriv-truncate-margin 8 \ 
--cleanup.remove-egs true \ --feat-dir $train_data_dir \ @@ -275,6 +267,7 @@ if $test_online_decoding && [ $stage -le 14 ]; then for lmtype in tgsmall; do steps/online/nnet3/decode.sh \ --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context-initial 0 \ --nj $nspk --cmd "$decode_cmd" \ $tree_dir/graph_${lmtype} data/${data} ${dir}_online/decode_${lmtype}_test_${data_affix} || exit 1 done From bc67d9f94f0575b55593feddfd86e4dd0103e8d9 Mon Sep 17 00:00:00 2001 From: Wonkyum Lee Date: Fri, 13 Jul 2018 16:03:53 -0700 Subject: [PATCH 09/26] add README.txt --- egs/zeroth_korean/s5/README.txt | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 egs/zeroth_korean/s5/README.txt diff --git a/egs/zeroth_korean/s5/README.txt b/egs/zeroth_korean/s5/README.txt new file mode 100644 index 00000000000..daa007362d8 --- /dev/null +++ b/egs/zeroth_korean/s5/README.txt @@ -0,0 +1,13 @@ +Zeroth-Korean kaldi example is from Zeroth Project. Zeroth project introduces free Korean speech corpus and aims to make Korean speech recognition more broadly accessible to everyone. This project was developed in collaboration between Lucas Jo(@Atlas Guide Inc.) and Wonkyum Lee(@Gridspace Inc.). + +In this example, we are using 51.6 hours transcribed Korean audio for training data (22,263 utterances, 105 people, 3000 sentences) and 1.2 hours transcribed Korean audio for testing data (457 utterances, 10 people). Besides audio and transcription, we provide pre-trained/designed language model, lexicon and morpheme-based segmenter(morfessor) + +The database can be also downloaded from openslr: +http://www.openslr.org/40 + +The database is licensed under Attribution 4.0 International (CC BY 4.0) + +This folder contains a speech recognition recipe which is based on WSJ/Librispeech example. 
+ +For more details about Zeroth project, please visit: +https://github.com/goodatlas/zeroth From 6554ff0ddb4dc11ff1b51c377ee29cf4b8576a0b Mon Sep 17 00:00:00 2001 From: Wonkyum Lee Date: Fri, 13 Jul 2018 16:23:37 -0700 Subject: [PATCH 10/26] compare_wer.sh script --- .../s5/local/chain/compare_wer.sh | 107 ++++++++++++++++++ 1 file changed, 107 insertions(+) create mode 100755 egs/zeroth_korean/s5/local/chain/compare_wer.sh diff --git a/egs/zeroth_korean/s5/local/chain/compare_wer.sh b/egs/zeroth_korean/s5/local/chain/compare_wer.sh new file mode 100755 index 00000000000..e8366bfb358 --- /dev/null +++ b/egs/zeroth_korean/s5/local/chain/compare_wer.sh @@ -0,0 +1,107 @@ +#!/bin/bash + +# this script is used for comparing decoding results between systems. +# e.g. local/chain/compare_wer.sh exp/chain/tdnn_{c,d}_sp +# For use with discriminatively trained systems you specify the epochs after a colon: +# for instance, +# local/chain/compare_wer.sh exp/chain/tdnn_c_sp exp/chain/tdnn_c_sp_smbr:{1,2,3} + + +if [ $# == 0 ]; then + echo "Usage: $0: [ ... ]" + echo "e.g.: $0 exp/chain/tdnn_{b,c}_sp" + echo "or (with epoch numbers for discriminative training):" + echo "$0 exp/chain/tdnn_b_sp_disc:{1,2,3}" + exit 1 +fi + +echo "# $0 $*" + +used_epochs=false + +# this function set_names is used to separate the epoch-related parts of the name +# [for discriminative training] and the regular parts of the name. 
+# If called with a colon-free directory name, like: +# set_names exp/chain/tdnn_lstm1e_sp_bi_smbr +# it will set dir=exp/chain/tdnn_lstm1e_sp_bi_smbr and epoch_infix="" +# If called with something like: +# set_names exp/chain/tdnn_d_sp_smbr:3 +# it will set dir=exp/chain/tdnn_d_sp_smbr and epoch_infix="_epoch3" + + +set_names() { + if [ $# != 1 ]; then + echo "compare_wer_general.sh: internal error" + exit 1 # exit the program + fi + dirname=$(echo $1 | cut -d: -f1) + epoch=$(echo $1 | cut -s -d: -f2) + if [ -z $epoch ]; then + epoch_infix="" + else + used_epochs=true + epoch_infix=_epoch${epoch} + fi +} + + + +echo -n "# System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +strings=( + "#WER test_clean (tgsmall) " + "#WER test_clean (fglarge) ") + +for n in 0 1 ; do + echo -n "${strings[$n]}" + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + decode_names=(tgsmall_test_clean fglarge_test_clean) + + wer=$(grep WER ${dirname}_online/decode_${decode_names[$n]}/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo +done + + +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. 
+fi + + +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final train prob (xent)" +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob (xent)" +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Num-params " +for x in $*; do + printf "% 10s" $(grep num-parameters $x/log/progress.1.log | awk '{print $2}') +done +echo From 7e82148abdc024d55ba03f6cb76f3aec9fde727d Mon Sep 17 00:00:00 2001 From: Wonkyum Lee Date: Fri, 13 Jul 2018 16:23:49 -0700 Subject: [PATCH 11/26] result and diagnostics added --- .../s5/local/chain/tuning/run_tdnn_1a.sh | 17 +++++++++++++++++ .../s5/local/chain/tuning/run_tdnn_opgru_1a.sh | 17 +++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/egs/zeroth_korean/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/zeroth_korean/s5/local/chain/tuning/run_tdnn_1a.sh index 381b13492d0..3809c1cc31c 100755 --- a/egs/zeroth_korean/s5/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/zeroth_korean/s5/local/chain/tuning/run_tdnn_1a.sh @@ -2,6 +2,23 @@ set -e -o pipefail +# This recipe trains TDNN-F AM +# The training recipe is from WSJ example(egs/wsj/s5/local/chain/tuning/run_tdnn_1g.sh) + +# steps/info/chain_dir_info.pl exp/chain/tdnn1a_sp +# exp/chain/tdnn1a_sp: num-iters=174 nj=2..8 num-params=8.4M dim=40+100->3040 combine=-0.049->-0.048 (over 3) xent:train/valid[115,173,final]=(-1.23,-0.838,-0.839/-1.22,-0.863,-0.859) 
logprob:train/valid[115,173,final]=(-0.091,-0.053,-0.053/-0.087,-0.056,-0.055) + +# ./local/chain/compare_wer.sh exp/chain/tdnn1a_sp +# System tdnn1a_sp +#WER test_clean (tgsmall) 19.11 +#WER test_clean (fglarge) 11.06 +# Final train prob -0.0527 +# Final valid prob -0.0545 +# Final train prob (xent) -0.8395 +# Final valid prob (xent) -0.8590 +# Num-params 8426432 + + # First the options that are passed through to run_ivector_common.sh # (some of which are also used in this script directly). stage=0 diff --git a/egs/zeroth_korean/s5/local/chain/tuning/run_tdnn_opgru_1a.sh b/egs/zeroth_korean/s5/local/chain/tuning/run_tdnn_opgru_1a.sh index 4fd92f5b346..1ea023c4b42 100755 --- a/egs/zeroth_korean/s5/local/chain/tuning/run_tdnn_opgru_1a.sh +++ b/egs/zeroth_korean/s5/local/chain/tuning/run_tdnn_opgru_1a.sh @@ -2,6 +2,23 @@ set -e -o pipefail +# This is recipe using TDNN+Norm-OPGRU. +# The recipe is based on AMI example.(egs/ami/s5b/local/chain/tuning/run_tdnn_opgru_1c.sh) + +# steps/info/chain_dir_info.pl exp/chain/tdnn_opgru1a_sp +# exp/chain/tdnn_opgru1a_sp: num-iters=99 nj=2..12 num-params=38.0M dim=40+100->3040 combine=-0.045->-0.045 (over 1) xent:train/valid[65,98,final]=(-1.19,-0.661,-0.647/-1.21,-0.696,-0.680) logprob:train/valid[65,98,final]=(-0.080,-0.039,-0.038/-0.076,-0.039,-0.038) + +# ./local/chain/compare_wer.sh exp/chain/tdnn_opgru1a_sp +# System tdnn_opgru1a_sp +#WER test_clean (tgsmall) 15.17 +#WER test_clean (fglarge) 9.14 +# Final train prob -0.0380 +# Final valid prob -0.0378 +# Final train prob (xent) -0.6470 +# Final valid prob (xent) -0.6805 +# Num-params 37970368 + + # First the options that are passed through to run_ivector_common.sh # (some of which are also used in this script directly). 
stage=0 From 62d6b36850f65618aa873035d1c1deb5fe090f6b Mon Sep 17 00:00:00 2001 From: Wonkyum Lee Date: Fri, 13 Jul 2018 16:37:16 -0700 Subject: [PATCH 12/26] frames-per-chunk added on decoding script --- egs/zeroth_korean/s5/local/chain/tuning/run_tdnn_opgru_1a.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/egs/zeroth_korean/s5/local/chain/tuning/run_tdnn_opgru_1a.sh b/egs/zeroth_korean/s5/local/chain/tuning/run_tdnn_opgru_1a.sh index 1ea023c4b42..8fc949f24ae 100755 --- a/egs/zeroth_korean/s5/local/chain/tuning/run_tdnn_opgru_1a.sh +++ b/egs/zeroth_korean/s5/local/chain/tuning/run_tdnn_opgru_1a.sh @@ -284,6 +284,7 @@ if $test_online_decoding && [ $stage -le 14 ]; then for lmtype in tgsmall; do steps/online/nnet3/decode.sh \ --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk 140 \ --extra-left-context-initial 0 \ --nj $nspk --cmd "$decode_cmd" \ $tree_dir/graph_${lmtype} data/${data} ${dir}_online/decode_${lmtype}_test_${data_affix} || exit 1 From 8ec007981ad5778d93c7cce08cdc0cd7e085c09a Mon Sep 17 00:00:00 2001 From: Wonkyum Lee Date: Fri, 13 Jul 2018 16:39:48 -0700 Subject: [PATCH 13/26] chunk left right this is from egs/ami/s5b/local/chain/tuning/run_tdnn_opgru_1c.sh --- .../s5/local/chain/tuning/run_tdnn_opgru_1a.sh | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/egs/zeroth_korean/s5/local/chain/tuning/run_tdnn_opgru_1a.sh b/egs/zeroth_korean/s5/local/chain/tuning/run_tdnn_opgru_1a.sh index 8fc949f24ae..097d9f4f4e9 100755 --- a/egs/zeroth_korean/s5/local/chain/tuning/run_tdnn_opgru_1a.sh +++ b/egs/zeroth_korean/s5/local/chain/tuning/run_tdnn_opgru_1a.sh @@ -42,8 +42,6 @@ xent_regularize=0.1 dropout_schedule='0,0@0.20,0.2@0.50,0' chunk_width=140,100,160 -chunk_left_context=40 -chunk_right_context=0 label_delay=5 remove_egs=true @@ -226,8 +224,8 @@ if [ $stage -le 12 ]; then --egs.dir "$common_egs_dir" \ --egs.opts "--frames-overlap-per-eg 0" \ --egs.chunk-width $chunk_width \ - --egs.chunk-left-context $chunk_left_context \ - 
--egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context 40 \ + --egs.chunk-right-context 0 \ --trainer.dropout-schedule $dropout_schedule \ --trainer.optimization.backstitch-training-scale 0.3 \ --trainer.optimization.backstitch-training-interval 1 \ From 3526f60871cb0017082fc2b8fdfe7519a5fa2691 Mon Sep 17 00:00:00 2001 From: Wonkyum Lee Date: Fri, 13 Jul 2018 16:45:16 -0700 Subject: [PATCH 14/26] omit $mfccdir --- egs/zeroth_korean/s5/local/nnet3/run_ivector_common.sh | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/egs/zeroth_korean/s5/local/nnet3/run_ivector_common.sh b/egs/zeroth_korean/s5/local/nnet3/run_ivector_common.sh index b3b60629a8c..38f9871e1f0 100755 --- a/egs/zeroth_korean/s5/local/nnet3/run_ivector_common.sh +++ b/egs/zeroth_korean/s5/local/nnet3/run_ivector_common.sh @@ -54,13 +54,12 @@ if [ $stage -le 3 ]; then # MFCC dir across multiple locations. You might want to be careful here, if you # have multiple copies of Kaldi checked out and run the same recipe, not to let # them overwrite each other. 
- mfccdir=mfcc_hires for datadir in ${trainset} ; do utils/copy_data_dir.sh data/$datadir data/${datadir}_hires steps/make_mfcc.sh --nj 40 --mfcc-config conf/mfcc_hires.conf \ - --cmd "$train_cmd" data/${datadir}_hires exp/make_hires/$datadir $mfccdir || exit 1; - steps/compute_cmvn_stats.sh data/${datadir}_hires exp/make_hires/$datadir $mfccdir || exit 1; + --cmd "$train_cmd" data/${datadir}_hires || exit 1; + steps/compute_cmvn_stats.sh data/${datadir}_hires || exit 1; done # We need to build a small system just because we need the LDA+MLLT transform From 6d19ab2765a625b30ab6c4a06b84a344126c829e Mon Sep 17 00:00:00 2001 From: Lucas Jo Date: Mon, 20 Aug 2018 15:10:31 +0000 Subject: [PATCH 15/26] removed locale dependency --- egs/zeroth_korean/s5/local/updateSegmentation.sh | 2 +- egs/zeroth_korean/s5/path.sh | 2 +- egs/zeroth_korean/s5/run.sh | 3 +-- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/egs/zeroth_korean/s5/local/updateSegmentation.sh b/egs/zeroth_korean/s5/local/updateSegmentation.sh index aa025765aae..e1eea821645 100755 --- a/egs/zeroth_korean/s5/local/updateSegmentation.sh +++ b/egs/zeroth_korean/s5/local/updateSegmentation.sh @@ -28,7 +28,7 @@ cp $trans $trans".old" awk '{print $1}' $trans".old" > $trans"_tmp_index" cut -d' ' -f2- $trans".old" |\ sed -E 's/\s+/ /g; s/^\s//g; s/\s$//g' |\ - morfessor -l $lmDir/zeroth_morfessor.seg -T - -o - \ + morfessor -e 'utf-8' -l $lmDir/zeroth_morfessor.seg -T - -o - \ --output-format '{analysis} ' --output-newlines \ --nosplit-re '[0-9\[\]\(\){}a-zA-Z&.,\-]+' \ | paste -d" " $trans"_tmp_index" - > $trans diff --git a/egs/zeroth_korean/s5/path.sh b/egs/zeroth_korean/s5/path.sh index 91c09618924..2d17b17a84a 100755 --- a/egs/zeroth_korean/s5/path.sh +++ b/egs/zeroth_korean/s5/path.sh @@ -3,4 +3,4 @@ export KALDI_ROOT=`pwd`/../../.. export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH [ ! 
-f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 . $KALDI_ROOT/tools/config/common_path.sh -export LC_ALL=ko_KR.UTF-8 +export LC_ALL=C diff --git a/egs/zeroth_korean/s5/run.sh b/egs/zeroth_korean/s5/run.sh index 32f99863cc5..fbc584e163a 100755 --- a/egs/zeroth_korean/s5/run.sh +++ b/egs/zeroth_korean/s5/run.sh @@ -11,8 +11,7 @@ # # Check list before start -# 1. locale setup (see egs/zeroth_korean/s5/path.sh; you need this "export LC_ALL=ko_KR.UTF-8" ) -# 2. required software: Morfessor-2.0.1 (see tools/extras/install_morfessor.sh) +# 1. required software: Morfessor-2.0.1 (see tools/extras/install_morfessor.sh) stage=0 db_dir=./db From 11d1a07d52a1a66b1f24b7b34f1b38ca995f36d4 Mon Sep 17 00:00:00 2001 From: Lucas Jo Date: Mon, 20 Aug 2018 15:12:31 +0000 Subject: [PATCH 16/26] removed locale dependency --- egs/zeroth_korean/s5/local/updateSegmentation.sh | 2 +- egs/zeroth_korean/s5/path.sh | 2 +- egs/zeroth_korean/s5/run.sh | 3 +-- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/egs/zeroth_korean/s5/local/updateSegmentation.sh b/egs/zeroth_korean/s5/local/updateSegmentation.sh index aa025765aae..e1eea821645 100755 --- a/egs/zeroth_korean/s5/local/updateSegmentation.sh +++ b/egs/zeroth_korean/s5/local/updateSegmentation.sh @@ -28,7 +28,7 @@ cp $trans $trans".old" awk '{print $1}' $trans".old" > $trans"_tmp_index" cut -d' ' -f2- $trans".old" |\ sed -E 's/\s+/ /g; s/^\s//g; s/\s$//g' |\ - morfessor -l $lmDir/zeroth_morfessor.seg -T - -o - \ + morfessor -e 'utf-8' -l $lmDir/zeroth_morfessor.seg -T - -o - \ --output-format '{analysis} ' --output-newlines \ --nosplit-re '[0-9\[\]\(\){}a-zA-Z&.,\-]+' \ | paste -d" " $trans"_tmp_index" - > $trans diff --git a/egs/zeroth_korean/s5/path.sh b/egs/zeroth_korean/s5/path.sh index 91c09618924..2d17b17a84a 100755 --- a/egs/zeroth_korean/s5/path.sh +++ b/egs/zeroth_korean/s5/path.sh @@ -3,4 +3,4 @@ export 
KALDI_ROOT=`pwd`/../../.. export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 . $KALDI_ROOT/tools/config/common_path.sh -export LC_ALL=ko_KR.UTF-8 +export LC_ALL=C diff --git a/egs/zeroth_korean/s5/run.sh b/egs/zeroth_korean/s5/run.sh index 32f99863cc5..fbc584e163a 100755 --- a/egs/zeroth_korean/s5/run.sh +++ b/egs/zeroth_korean/s5/run.sh @@ -11,8 +11,7 @@ # # Check list before start -# 1. locale setup (see egs/zeroth_korean/s5/path.sh; you need this "export LC_ALL=ko_KR.UTF-8" ) -# 2. required software: Morfessor-2.0.1 (see tools/extras/install_morfessor.sh) +# 1. required software: Morfessor-2.0.1 (see tools/extras/install_morfessor.sh) stage=0 db_dir=./db From c75b1f83bd82687c4ed0d9eef4e4ce8471d53c38 Mon Sep 17 00:00:00 2001 From: Lucas Jo Date: Mon, 20 Aug 2018 15:28:28 +0000 Subject: [PATCH 17/26] changed filename --- .../s5/local/{updateSegmentation.sh => update_segmentation.sh} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename egs/zeroth_korean/s5/local/{updateSegmentation.sh => update_segmentation.sh} (100%) diff --git a/egs/zeroth_korean/s5/local/updateSegmentation.sh b/egs/zeroth_korean/s5/local/update_segmentation.sh similarity index 100% rename from egs/zeroth_korean/s5/local/updateSegmentation.sh rename to egs/zeroth_korean/s5/local/update_segmentation.sh From d1b227779849b6563dfc95a73b500ee280b93248 Mon Sep 17 00:00:00 2001 From: Lucas Jo Date: Mon, 20 Aug 2018 15:42:33 +0000 Subject: [PATCH 18/26] re-indented with no tab --- egs/zeroth_korean/s5/local/prepare_dict.sh | 28 +++++++++++----------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/egs/zeroth_korean/s5/local/prepare_dict.sh b/egs/zeroth_korean/s5/local/prepare_dict.sh index a4038ed7f43..76c6821e11e 100755 --- a/egs/zeroth_korean/s5/local/prepare_dict.sh +++ b/egs/zeroth_korean/s5/local/prepare_dict.sh 
@@ -7,9 +7,9 @@ # Prepare dictionary if [ $# -ne 2 ]; then - echo "Usage: $0 " - echo "e.g.: /data/local/lm data/local/dict_nosp" - exit 1 + echo "Usage: $0 " + echo "e.g.: /data/local/lm data/local/dict_nosp" + exit 1 fi lm_dir=$1 dst_dir=$2 @@ -20,7 +20,7 @@ mkdir -p $dst_dir || exit 1; lexicon_raw_nosil=$dst_dir/lexicon_raw_nosil.txt if [[ ! -s "$lexicon_raw_nosil" ]]; then - cp $lm_dir/zeroth_lexicon $lexicon_raw_nosil || exit 1 + cp $lm_dir/zeroth_lexicon $lexicon_raw_nosil || exit 1 fi silence_phones=$dst_dir/silence_phones.txt @@ -35,31 +35,31 @@ echo SIL > $optional_silence # nonsilence phones; on each line is a list of phones that correspond # really to the same base phone. awk '{for (i=2; i<=NF; ++i) { print $i; gsub(/[0-9]/, "", $i); print $i}}' $lexicon_raw_nosil |\ - sort -u |\ - perl -e 'while(<>){ -chop; m:^([^\d]+)(\d*)$: || die "Bad phone $_"; -$phones_of{$1} .= "$_ "; } -foreach $list (values %phones_of) {print $list . "\n"; } ' \ - > $nonsil_phones || exit 1; + sort -u |\ + perl -e 'while(<>){ + chop; m:^([^\d]+)(\d*)$: || die "Bad phone $_"; + $phones_of{$1} .= "$_ "; } + foreach $list (values %phones_of) {print $list . "\n"; } ' \ + > $nonsil_phones || exit 1; # A few extra questions that will be added to those obtained by # automatically clustering # the "real" phones. These ask about stress; there's also one for # silence. 
cat $silence_phones| awk '{printf("%s ", $1);} END{printf "\n";}' > $extra_questions || exit 1; cat $nonsil_phones | perl -e 'while(<>){ foreach $p (split(" ", $_)){ - $p =~ m:^([^\d]+)(\d*)$: || die "Bad phone $_"; $q{$2} .= "$p "; } } foreach $l (values %q) {print "$l\n";}' \ - >> $extra_questions || exit 1; +$p =~ m:^([^\d]+)(\d*)$: || die "Bad phone $_"; $q{$2} .= "$p "; } } foreach $l (values %q) {print "$l\n";}' \ + >> $extra_questions || exit 1; echo "$(wc -l <$silence_phones) silence phones saved to: $silence_phones" echo "$(wc -l <$optional_silence) optional silence saved to: $optional_silence" echo "$(wc -l <$nonsil_phones) non-silence phones saved to: $nonsil_phones" -echo "$(wc -l <$extra_questions) extra triphone clustering-related questions saved to: $extra_questions" +echo "$(wc -l <$extra_questions) extra triphone clustering-related questions saved to: $extra_questions" #(echo '!SIL SIL'; echo '[BREATH] BRH'; echo '[NOISE] NSN'; echo '[COUGH] CGH'; # echo '[SMACK] SMK'; echo '[UM] UM'; echo '[UH] UHH' # echo ' NSN' ) | \ (echo '!SIL SIL'; echo ' SPN'; echo ' SPN'; ) |\ -cat - $lexicon_raw_nosil | sort | uniq >$dst_dir/lexicon.txt + cat - $lexicon_raw_nosil | sort | uniq >$dst_dir/lexicon.txt echo "Lexicon text file saved as: $dst_dir/lexicon.txt" exit 0 From 17378f2af6d1be0b6b17edb7a851a428fbd62ea7 Mon Sep 17 00:00:00 2001 From: Lucas Jo Date: Mon, 20 Aug 2018 15:57:46 +0000 Subject: [PATCH 19/26] changed to use PCA instead of LDA+MLLT --- egs/zeroth_korean/s5/local/nnet3/run_ivector_common.sh | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/egs/zeroth_korean/s5/local/nnet3/run_ivector_common.sh b/egs/zeroth_korean/s5/local/nnet3/run_ivector_common.sh index 38f9871e1f0..ea186be7b90 100755 --- a/egs/zeroth_korean/s5/local/nnet3/run_ivector_common.sh +++ b/egs/zeroth_korean/s5/local/nnet3/run_ivector_common.sh @@ -76,19 +76,17 @@ if [ $stage -le 4 ]; then mkdir exp -p exp/nnet3 - steps/train_lda_mllt.sh --cmd "$train_cmd" 
--num-iters 13 \ - --realign-iters "" \ + steps/online/nnet2/get_pca_transform.sh --cmd "$train_cmd" \ --splice-opts "--left-context=3 --right-context=3" \ - 3000 10000 data/${trainset}_hires data/lang_nosp \ - ${gmmdir}_ali_${trainset} exp/nnet3/tri2 + --max-utts 30000 --subsample 2 \ + data/${trainset}_hires exp/nnet3/pca_transform fi - if [ $stage -le 5 ]; then # To train a diagonal UBM we don't need very much data, so use a small subset # (actually, it's not that small: still around 100 hours). steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 --num-frames 700000 \ - data/train_30k_hires 512 exp/nnet3/tri2 exp/nnet3/diag_ubm + data/train_30k_hires 512 exp/nnet3/pca_transform exp/nnet3/diag_ubm fi if [ $stage -le 6 ]; then From 90400dd5d54ff73accfd3d99668ff48ef79e4140 Mon Sep 17 00:00:00 2001 From: Lucas Jo Date: Mon, 20 Aug 2018 16:02:59 +0000 Subject: [PATCH 20/26] added -bash on echo statements --- egs/zeroth_korean/s5/run.sh | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/egs/zeroth_korean/s5/run.sh b/egs/zeroth_korean/s5/run.sh index fbc584e163a..a049a2c2597 100755 --- a/egs/zeroth_korean/s5/run.sh +++ b/egs/zeroth_korean/s5/run.sh @@ -93,7 +93,7 @@ if [ $stage -le 5 ]; then fi if [ $stage -le 6 ]; then - echo "#### Monophone Training ###########" + echo "$0: #### Monophone Training ###########" # train a monophone system with 2k short utts steps/train_mono.sh --boost-silence 1.25 --nj $nj --cmd "$train_cmd" \ data/train_2kshort data/lang_nosp exp/mono @@ -114,7 +114,7 @@ if [ $stage -le 6 ]; then fi if [ $stage -le 7 ]; then - echo "#### Triphone Training, delta + delta-delta ###########" + echo "$0: #### Triphone Training, delta + delta-delta ###########" steps/align_si.sh --boost-silence 1.25 --nj $nj --cmd "$train_cmd" \ data/train_5k data/lang_nosp exp/mono exp/mono_ali_5k # train a first delta + delta-delta triphone system on a subset of 5000 utterances @@ -137,7 +137,7 @@ if [ $stage -le 7 ]; then fi 
if [ $stage -le 8 ]; then - echo "#### Triphone Training, LDA+MLLT ###########" + echo "$0: #### Triphone Training, LDA+MLLT ###########" steps/align_si.sh --nj $nj --cmd "$train_cmd" \ data/train_10k data/lang_nosp exp/tri1 exp/tri1_ali_10k # train an LDA+MLLT system. @@ -162,7 +162,7 @@ fi if [ $stage -le 9 ]; then - echo "#### Triphone Training, LDA+MLLT+SAT ###########" + echo "$0: #### Triphone Training, LDA+MLLT+SAT ###########" # Align the entire train_clean using the tri2 model steps/align_si.sh --nj $nj --cmd "$train_cmd" --use-graphs true \ data/train_clean data/lang_nosp exp/tri2 exp/tri2_ali_train_clean @@ -187,7 +187,7 @@ if [ $stage -le 9 ]; then fi if [ $stage -le 10 ]; then - echo "#### Re-computing pronunciation model using tri3 model ###########" + echo "$0: #### Re-computing pronunciation model using tri3 model ###########" # Now we compute the pronunciation and silence probabilities from training data, # and re-create the lang directory. # silence transition probability ... 
@@ -227,7 +227,7 @@ fi if [ $stage -le 11 ]; then - echo "#### SAT again on train_clean ###########" + echo "$0: #### SAT again on train_clean ###########" # align the entire train_clean using the tri3 model steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \ data/train_clean data/lang exp/tri3 exp/tri3_ali_train_clean @@ -252,11 +252,11 @@ if [ $stage -le 11 ]; then fi fi -echo "GMM trainig is Done" +echo "$0: GMM training is Done" if $chain_train; then ## Training Chain Acoustic model using clean data set - echo "#### chain training ###########" + echo "$0: #### chain training ###########" local/chain/run_tdnn.sh fi From 6d010aab37c0771dfacf5c32a5ba96fbd0448264 Mon Sep 17 00:00:00 2001 From: Wonkyum Lee Date: Mon, 27 Aug 2018 14:43:59 -0700 Subject: [PATCH 21/26] fix pointing update_segmentation.sh in run.sh --- egs/zeroth_korean/s5/run.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/zeroth_korean/s5/run.sh b/egs/zeroth_korean/s5/run.sh index a049a2c2597..58db7a93ad9 100755 --- a/egs/zeroth_korean/s5/run.sh +++ b/egs/zeroth_korean/s5/run.sh @@ -45,7 +45,7 @@ fi if [ $stage -le 2 ]; then # update segmentation of transcripts for part in train_data_01 test_data_01; do - local/updateSegmentation.sh data/$part data/local/lm + local/update_segmentation.sh data/$part data/local/lm done fi From bd8094f32e1c051480968477e785aa3c340dfa12 Mon Sep 17 00:00:00 2001 From: Wonkyum Lee Date: Mon, 27 Aug 2018 14:45:12 -0700 Subject: [PATCH 22/26] simplified and added echo statement --- .../s5/local/nnet3/run_ivector_common.sh | 27 +++++++++---------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/egs/zeroth_korean/s5/local/nnet3/run_ivector_common.sh b/egs/zeroth_korean/s5/local/nnet3/run_ivector_common.sh index ea186be7b90..7bde9a1ad9b 100755 --- a/egs/zeroth_korean/s5/local/nnet3/run_ivector_common.sh +++ b/egs/zeroth_korean/s5/local/nnet3/run_ivector_common.sh @@ -17,30 +17,22 @@ set -e if [ "$speed_perturb" == "true" ]; then if [ $stage
-le 1 ]; then + echo "$0: preparing directory for speed-perturbed data" #Although the nnet will be trained by high resolution data, we still have to perturbe the normal data to get the alignment # _sp stands for speed-perturbed - for datadir in ${trainset} ; do - utils/perturb_data_dir_speed.sh 0.9 data/${datadir} data/temp1 - utils/perturb_data_dir_speed.sh 1.1 data/${datadir} data/temp2 - utils/combine_data.sh data/${datadir}_tmp data/temp1 data/temp2 - utils/validate_data_dir.sh --no-feats data/${datadir}_tmp - rm -r data/temp1 data/temp2 + utils/data/perturb_data_dir_speed_3way.sh data/${datadir} data/${datadir}_sp mfccdir=mfcc_perturbed steps/make_mfcc.sh --cmd "$train_cmd" --nj 40 \ - data/${datadir}_tmp exp/make_mfcc/${datadir}_tmp $mfccdir || exit 1; - steps/compute_cmvn_stats.sh data/${datadir}_tmp exp/make_mfcc/${datadir}_tmp $mfccdir || exit 1; - utils/fix_data_dir.sh data/${datadir}_tmp - - utils/copy_data_dir.sh --spk-prefix sp1.0- --utt-prefix sp1.0- data/${datadir} data/temp0 - utils/combine_data.sh data/${datadir}_sp data/${datadir}_tmp data/temp0 + data/${datadir}_sp exp/make_mfcc/${datadir}_sp $mfccdir || exit 1; + steps/compute_cmvn_stats.sh data/${datadir}_sp exp/make_mfcc/${datadir}_sp $mfccdir || exit 1; utils/fix_data_dir.sh data/${datadir}_sp - rm -r data/temp0 data/${datadir}_tmp done fi if [ $stage -le 2 ]; then + echo "$0: aligning with the perturbed low-resolution data" #obtain the alignment of the perturbed data steps/align_fmllr.sh --nj 100 --cmd "$train_cmd" \ data/${trainset}_sp data/lang_nosp ${gmmdir} ${gmmdir}_ali_${trainset}_sp || exit 1 @@ -55,6 +47,7 @@ if [ $stage -le 3 ]; then # have multiple copies of Kaldi checked out and run the same recipe, not to let # them overwrite each other. 
+ echo "$0: creating high-resolution MFCC features" for datadir in ${trainset} ; do utils/copy_data_dir.sh data/$datadir data/${datadir}_hires steps/make_mfcc.sh --nj 40 --mfcc-config conf/mfcc_hires.conf \ @@ -74,8 +67,8 @@ if [ $stage -le 4 ]; then # because after we get the transform (12th iter is the last), any further # training is pointless. + echo "$0: computing a PCA transform from the hires data." mkdir exp -p exp/nnet3 - steps/online/nnet2/get_pca_transform.sh --cmd "$train_cmd" \ --splice-opts "--left-context=3 --right-context=3" \ --max-utts 30000 --subsample 2 \ @@ -84,12 +77,15 @@ fi if [ $stage -le 5 ]; then # To train a diagonal UBM we don't need very much data, so use a small subset - # (actually, it's not that small: still around 100 hours). + echo "$0: computing a PCA transform from the hires data." steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 --num-frames 700000 \ data/train_30k_hires 512 exp/nnet3/pca_transform exp/nnet3/diag_ubm fi if [ $stage -le 6 ]; then + # Train the iVector extractor. Use all of the speed-perturbed data since iVector extractors + # can be sensitive to the amount of data. The script defaults to an iVector dimension of 100 + echo "$0: training the iVector extractor" steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \ data/${trainset}_hires exp/nnet3/diag_ubm exp/nnet3/extractor || exit 1; fi @@ -104,6 +100,7 @@ if [ $stage -le 7 ]; then # having a larger number of speakers is helpful for generalization, and to # handle per-utterance decoding well (iVector starts at zero). 
+ echo "$0: extracting iVector using trained iVector extractor" utils/data/modify_speaker_info.sh --utts-per-spk-max 2 \ data/${trainset}_hires data/${trainset}_hires_max2 From 7b55b5f5a79ac2080f66727ecb2794f31d1ca4c9 Mon Sep 17 00:00:00 2001 From: Wonkyum Lee Date: Tue, 28 Aug 2018 09:45:17 -0700 Subject: [PATCH 23/26] results updated --- egs/zeroth_korean/s5/RESULTS | 13 +++++++------ .../s5/local/chain/tuning/run_tdnn_1a.sh | 11 +++++------ .../s5/local/chain/tuning/run_tdnn_opgru_1a.sh | 12 ++++++------ 3 files changed, 18 insertions(+), 18 deletions(-) diff --git a/egs/zeroth_korean/s5/RESULTS b/egs/zeroth_korean/s5/RESULTS index d8503cfcac4..976157fa584 100644 --- a/egs/zeroth_korean/s5/RESULTS +++ b/egs/zeroth_korean/s5/RESULTS @@ -52,12 +52,13 @@ done exit 0 # tdnn_1a is a kind of factorized TDNN, with skip connections. -exp/chain/tdnn_1a_sp: num-iters=72 nj=3..16 num-params=18.6M dim=40+100->3040 combine=-0.046->-0.045 (over 3) xent:train/valid[47,71,final]=(-0.898,-0.775,-0.766/-0.967,-0.855,-0.845) logprob:train/valid[47,71,final]=(-0.056,-0.043,-0.043/-0.069,-0.057,-0.057) -%WER 11.42 [ 1057 / 9253, 128 ins, 193 del, 736 sub ] exp/chain/tdnn_1a_sp_online/decode_fglarge_test_clean/wer_16_1.0 -%WER 19.25 [ 1781 / 9253, 188 ins, 291 del, 1302 sub ] exp/chain/tdnn_1a_sp_online/decode_tgsmall_test_clean/wer_11_0.5 +exp/chain/tdnn1a_sp: num-iters=174 nj=2..8 num-params=8.4M dim=40+100->3040 combine=-0.049->-0.048 (over 3) xent:train/valid[115,173,final]=(-1.21,-0.841,-0.837/-1.20,-0.856,-0.853) logprob:train/valid[115,173,final]=(-0.091,-0.053,-0.053/-0.084,-0.055,-0.054) +%WER 11.08 [ 1025 / 9253, 155 ins, 155 del, 715 sub ] exp/chain/tdnn1a_sp_online/decode_fglarge_test_clean/wer_11_0.0 +%WER 18.93 [ 1752 / 9253, 209 ins, 273 del, 1270 sub ] exp/chain/tdnn1a_sp_online/decode_tgsmall_test_clean/wer_11_0.0 + # This chain system has TDNN+Norm-OPGRU architecture.
-exp/chain/tdnn_opgru_1a_sp: num-iters=130 nj=2..12 num-params=37.9M dim=40+100->3000 combine=-0.040->-0.038 (over 6) xent:train/valid[85,129,final]=(-1.12,-0.608,-0.616/-1.21,-0.697,-0.705) logprob:train/valid[85,129,final]=(-0.062,-0.027,-0.027/-0.067,-0.030,-0.030) -%WER 9.33 [ 863 / 9253, 101 ins, 162 del, 600 sub ] exp/chain/tdnn_opgru_1a_sp_online/decode_fglarge_test_clean/wer_8_1.0 -%WER 15.13 [ 1400 / 9253, 154 ins, 217 del, 1029 sub ] exp/chain/tdnn_opgru_1a_sp_online/decode_tgsmall_test_clean/wer_9_0.0 +exp/chain/tdnn_opgru1a_sp: num-iters=99 nj=2..12 num-params=38.0M dim=40+100->3040 combine=-0.045->-0.045 (over 1) xent:train/valid[65,98,final]=(-1.18,-0.663,-0.651/-1.21,-0.698,-0.684) logprob:train/valid[65,98,final]=(-0.079,-0.038,-0.037/-0.076,-0.040,-0.039) +%WER 9.45 [ 874 / 9253, 109 ins, 159 del, 606 sub ] exp/chain/tdnn_opgru1a_sp_online/decode_fglarge_test_clean/wer_10_1.0 +%WER 15.22 [ 1408 / 9253, 175 ins, 196 del, 1037 sub ] exp/chain/tdnn_opgru1a_sp_online/decode_tgsmall_test_clean/wer_8_0.0 diff --git a/egs/zeroth_korean/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/zeroth_korean/s5/local/chain/tuning/run_tdnn_1a.sh index 3809c1cc31c..20ffd6630c4 100755 --- a/egs/zeroth_korean/s5/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/zeroth_korean/s5/local/chain/tuning/run_tdnn_1a.sh @@ -10,15 +10,14 @@ set -e -o pipefail # ./local/chain/compare_wer.sh exp/chain/tdnn1a_sp # System tdnn1a_sp -#WER test_clean (tgsmall) 19.11 -#WER test_clean (fglarge) 11.06 +#WER test_clean (tgsmall) 18.93 +#WER test_clean (fglarge) 11.08 # Final train prob -0.0527 -# Final valid prob -0.0545 -# Final train prob (xent) -0.8395 -# Final valid prob (xent) -0.8590 +# Final valid prob -0.0541 +# Final train prob (xent) -0.8366 +# Final valid prob (xent) -0.8532 # Num-params 8426432 - # First the options that are passed through to run_ivector_common.sh # (some of which are also used in this script directly). 
stage=0 diff --git a/egs/zeroth_korean/s5/local/chain/tuning/run_tdnn_opgru_1a.sh b/egs/zeroth_korean/s5/local/chain/tuning/run_tdnn_opgru_1a.sh index 097d9f4f4e9..44110888519 100755 --- a/egs/zeroth_korean/s5/local/chain/tuning/run_tdnn_opgru_1a.sh +++ b/egs/zeroth_korean/s5/local/chain/tuning/run_tdnn_opgru_1a.sh @@ -10,12 +10,12 @@ set -e -o pipefail # ./local/chain/compare_wer.sh exp/chain/tdnn_opgru1a_sp # System tdnn_opgru1a_sp -#WER test_clean (tgsmall) 15.17 -#WER test_clean (fglarge) 9.14 -# Final train prob -0.0380 -# Final valid prob -0.0378 -# Final train prob (xent) -0.6470 -# Final valid prob (xent) -0.6805 +#WER test_clean (tgsmall) 15.22 +#WER test_clean (fglarge) 9.45 +# Final train prob -0.0373 +# Final valid prob -0.0386 +# Final train prob (xent) -0.6506 +# Final valid prob (xent) -0.6837 # Num-params 37970368 From cb817af6d013a271f0300f4bfb75d1666963fe3b Mon Sep 17 00:00:00 2001 From: Wonkyum Lee Date: Thu, 30 Aug 2018 22:52:19 -0700 Subject: [PATCH 24/26] data prep interface change --- egs/zeroth_korean/s5/local/data_prep.sh | 11 +++++++---- egs/zeroth_korean/s5/run.sh | 2 +- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/egs/zeroth_korean/s5/local/data_prep.sh b/egs/zeroth_korean/s5/local/data_prep.sh index 5e6a7d02ce6..4fbb727f1cb 100755 --- a/egs/zeroth_korean/s5/local/data_prep.sh +++ b/egs/zeroth_korean/s5/local/data_prep.sh @@ -7,13 +7,16 @@ # Modified by Lucas Jo 2017 (Altas Guide) if [ "$#" -ne 2 ]; then - echo "Usage: $0 " + echo "Usage: $0 " echo "e.g.: $0 ./db/train_data_01 data/train_data_01" exit 1 fi -src=$1 -dst=$2 +db_dir=$1 +data_part=$2 + +src=${db_dir}/${data_part} +dst=data/${data_part} # all utterances are FLAC compressed if ! which flac >&/dev/null; then @@ -21,7 +24,7 @@ if ! 
which flac >&/dev/null; then exit 1 fi -spk_file=$src/../AUDIO_INFO +spk_file=${db_dir}/AUDIO_INFO mkdir -p $dst || exit 1; diff --git a/egs/zeroth_korean/s5/run.sh b/egs/zeroth_korean/s5/run.sh index 58db7a93ad9..c5c7506980b 100755 --- a/egs/zeroth_korean/s5/run.sh +++ b/egs/zeroth_korean/s5/run.sh @@ -38,7 +38,7 @@ if [ $stage -le 1 ]; then # format the data as Kaldi data directories for part in train_data_01 test_data_01; do # use underscore-separated names in data directories. - local/data_prep.sh $db_dir/$part data/$part + local/data_prep.sh $db_dir $part done fi From 6259aed7afa33fde2fb1d820f411681b7aa32017 Mon Sep 17 00:00:00 2001 From: Wonkyum Lee Date: Thu, 30 Aug 2018 23:01:44 -0700 Subject: [PATCH 25/26] cosmetic fix for ivector script --- .../s5/local/nnet3/run_ivector_common.sh | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/egs/zeroth_korean/s5/local/nnet3/run_ivector_common.sh b/egs/zeroth_korean/s5/local/nnet3/run_ivector_common.sh index 7bde9a1ad9b..70be96310e1 100755 --- a/egs/zeroth_korean/s5/local/nnet3/run_ivector_common.sh +++ b/egs/zeroth_korean/s5/local/nnet3/run_ivector_common.sh @@ -55,29 +55,25 @@ if [ $stage -le 3 ]; then steps/compute_cmvn_stats.sh data/${datadir}_hires || exit 1; done - # We need to build a small system just because we need the LDA+MLLT transform - # to train the diag-UBM on top of. We align a subset of training data for - # this purpose. + # We need to build a small system just because we need PCA transform + # to train the diag-UBM on top of. utils/subset_data_dir.sh data/${trainset}_hires 30000 data/train_30k_hires fi if [ $stage -le 4 ]; then - # Train a small system just for its LDA+MLLT transform. We use --num-iters 13 - # because after we get the transform (12th iter is the last), any further - # training is pointless. - + # Train a small system just for its PCA transform. echo "$0: computing a PCA transform from the hires data." 
mkdir exp -p exp/nnet3 steps/online/nnet2/get_pca_transform.sh --cmd "$train_cmd" \ --splice-opts "--left-context=3 --right-context=3" \ --max-utts 30000 --subsample 2 \ - data/${trainset}_hires exp/nnet3/pca_transform + data/train_30k_hires exp/nnet3/pca_transform fi if [ $stage -le 5 ]; then # To train a diagonal UBM we don't need very much data, so use a small subset - echo "$0: computing a PCA transform from the hires data." + echo "$0: training the diagonal UBM." steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 --num-frames 700000 \ data/train_30k_hires 512 exp/nnet3/pca_transform exp/nnet3/diag_ubm fi From 7e14701aeb3ca9814d3e72124766468ed2ddfd89 Mon Sep 17 00:00:00 2001 From: Wonkyum Lee Date: Fri, 31 Aug 2018 09:10:25 -0700 Subject: [PATCH 26/26] increase parameter for TDNN-F --- egs/zeroth_korean/s5/RESULTS | 7 ++- .../s5/local/chain/tuning/run_tdnn_1a.sh | 54 +++++++++---------- 2 files changed, 30 insertions(+), 31 deletions(-) diff --git a/egs/zeroth_korean/s5/RESULTS b/egs/zeroth_korean/s5/RESULTS index 976157fa584..9255ec17673 100644 --- a/egs/zeroth_korean/s5/RESULTS +++ b/egs/zeroth_korean/s5/RESULTS @@ -52,10 +52,9 @@ done exit 0 # tdnn_1a is a kind of factorized TDNN, with skip connections. 
-exp/chain/tdnn1a_sp: num-iters=174 nj=2..8 num-params=8.4M dim=40+100->3040 combine=-0.049->-0.048 (over 3) xent:train/valid[115,173,final]=(-1.21,-0.841,-0.837/-1.20,-0.856,-0.853) logprob:train/valid[115,173,final]=(-0.091,-0.053,-0.053/-0.084,-0.055,-0.054) -%WER 11.08 [ 1025 / 9253, 155 ins, 155 del, 715 sub ] exp/chain/tdnn1a_sp_online/decode_fglarge_test_clean/wer_11_0.0 -%WER 18.93 [ 1752 / 9253, 209 ins, 273 del, 1270 sub ] exp/chain/tdnn1a_sp_online/decode_tgsmall_test_clean/wer_11_0.0 - +exp/chain/tdnn1b_sp: num-iters=174 nj=2..8 num-params=12.9M dim=40+100->3040 combine=-0.041->-0.041 (over 2) xent:train/valid[115,173,final]=(-1.14,-0.759,-0.751/-1.14,-0.788,-0.777) logprob:train/valid[115,173,final]=(-0.084,-0.047,-0.046/-0.080,-0.050,-0.048) +%WER 10.55 [ 976 / 9253, 122 ins, 166 del, 688 sub ] exp/chain/tdnn1b_sp_online/decode_fglarge_test_clean/wer_13_1.0 +%WER 17.65 [ 1633 / 9253, 208 ins, 233 del, 1192 sub ] exp/chain/tdnn1b_sp_online/decode_tgsmall_test_clean/wer_10_0.0 # This chain system has TDNN+Norm-OPGRU architecture. 
exp/chain/tdnn_opgru1a_sp: num-iters=99 nj=2..12 num-params=38.0M dim=40+100->3040 combine=-0.045->-0.045 (over 1) xent:train/valid[65,98,final]=(-1.18,-0.663,-0.651/-1.21,-0.698,-0.684) logprob:train/valid[65,98,final]=(-0.079,-0.038,-0.037/-0.076,-0.040,-0.039) diff --git a/egs/zeroth_korean/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/zeroth_korean/s5/local/chain/tuning/run_tdnn_1a.sh index 20ffd6630c4..55e046dd55a 100755 --- a/egs/zeroth_korean/s5/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/zeroth_korean/s5/local/chain/tuning/run_tdnn_1a.sh @@ -6,17 +6,17 @@ set -e -o pipefail # The training recipe is from WSJ example(egs/wsj/s5/local/chain/tuning/run_tdnn_1g.sh) # steps/info/chain_dir_info.pl exp/chain/tdnn1a_sp -# exp/chain/tdnn1a_sp: num-iters=174 nj=2..8 num-params=8.4M dim=40+100->3040 combine=-0.049->-0.048 (over 3) xent:train/valid[115,173,final]=(-1.23,-0.838,-0.839/-1.22,-0.863,-0.859) logprob:train/valid[115,173,final]=(-0.091,-0.053,-0.053/-0.087,-0.056,-0.055) +# exp/chain/tdnn1b_sp: num-iters=174 nj=2..8 num-params=12.9M dim=40+100->3040 combine=-0.041->-0.041 (over 2) xent:train/valid[115,173,final]=(-1.14,-0.759,-0.751/-1.14,-0.788,-0.777) logprob:train/valid[115,173,final]=(-0.084,-0.047,-0.046/-0.080,-0.050,-0.048) # ./local/chain/compare_wer.sh exp/chain/tdnn1a_sp -# System tdnn1a_sp -#WER test_clean (tgsmall) 18.93 -#WER test_clean (fglarge) 11.08 -# Final train prob -0.0527 -# Final valid prob -0.0541 -# Final train prob (xent) -0.8366 -# Final valid prob (xent) -0.8532 -# Num-params 8426432 +# System tdnn1b_sp +#WER test_clean (tgsmall) 17.65 +#WER test_clean (fglarge) 10.55 +# Final train prob -0.0460 +# Final valid prob -0.0480 +# Final train prob (xent) -0.7512 +# Final valid prob (xent) -0.7769 +# Num-params 12922560 # First the options that are passed through to run_ivector_common.sh # (some of which are also used in this script directly). 
@@ -174,26 +174,26 @@ if [ $stage -le 11 ]; then fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat # the first splicing is moved before the lda layer, so no splicing here - relu-batchnorm-dropout-layer name=tdnn1 $tdnn_opts dim=1024 - tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1 - tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1 - tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1 - tdnnf-layer name=tdnnf5 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=0 - tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 - tdnnf-layer name=tdnnf7 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 - tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 - tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 - tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 - tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 - tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 - tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 - linear-component name=prefinal-l dim=192 $linear_opts - - - prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1024 small-dim=192 + relu-batchnorm-dropout-layer name=tdnn1 $tdnn_opts dim=1280 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1280 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1280 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1280 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=1280 bottleneck-dim=160 time-stride=0 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1280 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=1280 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf8 
$tdnnf_opts dim=1280 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1280 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1280 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1280 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1280 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1280 bottleneck-dim=160 time-stride=3 + linear-component name=prefinal-l dim=256 $linear_opts + + + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1280 small-dim=256 output-layer name=output include-log-softmax=false dim=$num_targets $output_opts - prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=1024 small-dim=192 + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=1280 small-dim=256 output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts EOF