diff --git a/egs/babel_multilang/s5/conf/common.fullLP b/egs/babel_multilang/s5/conf/common.fullLP new file mode 100644 index 00000000000..264e51311b8 --- /dev/null +++ b/egs/babel_multilang/s5/conf/common.fullLP @@ -0,0 +1,99 @@ +# BNF training parameters +bnf_num_hidden_layers=6 +bottleneck_dim=42 +bnf_hidden_layer_dim=2048 +bnf_minibatch_size=512 +bnf_init_learning_rate=0.008 +bnf_final_learning_rate=0.0008 +bnf_max_change=40 +bnf_num_jobs=4 +bnf_num_threads=1 +bnf_mixup=10000 +bnf_mpe_learning_rate=0.00009 +bnf_mpe_last_layer_factor=0.1 +bnf_num_gauss_ubm=550 # use fewer UBM Gaussians than the + # non-bottleneck system (which has 800) +bnf_num_gauss_sgmm=50000 # use fewer SGMM sub-states than the + # non-bottleneck system (which has 80000). +bnf_decode_acwt=0.066666 + + +# DNN hybrid system training parameters +dnn_num_hidden_layers=4 +dnn_input_dim=4000 +dnn_output_dim=400 +dnn_init_learning_rate=0.008 +dnn_final_learning_rate=0.0008 +dnn_mixup=12000 + +dnn_mpe_learning_rate=0.00008 +dnn_mpe_last_layer_factor=0.1 +dnn_mpe_retroactive=true + +bnf_every_nth_frame=2 # take every 2nd frame. +babel_type=full + +use_pitch=true + +lmwt_plp_extra_opts=( --min-lmwt 8 --max-lmwt 12 ) +lmwt_bnf_extra_opts=( --min-lmwt 15 --max-lmwt 22 ) +lmwt_dnn_extra_opts=( --min-lmwt 10 --max-lmwt 15 ) + +dnn_beam=16.0 +dnn_lat_beam=8.5 + +icu_opt=(--use-icu true --icu-transform Any-Lower) + +if [[ `hostname` == *.tacc.utexas.edu ]] ; then + decode_extra_opts=( --num-threads 4 --parallel-opts "-pe smp 4" ) + sgmm_train_extra_opts=( ) + sgmm_group_extra_opts=( --num_iters 25 ) + sgmm_denlats_extra_opts=( --num-threads 2 ) + sgmm_mmi_extra_opts=(--cmd "local/lonestar.py -pe smp 2") + dnn_denlats_extra_opts=( --num-threads 2 ) + + dnn_cpu_parallel_opts=(--minibatch-size 128 --num-jobs-nnet 8 --num-threads 16 \ + --parallel-opts "-pe smp 16" ) + dnn_gpu_parallel_opts=(--minibatch-size 512 --num-jobs-nnet 8 --num-threads 1) + + dnn_gpu_mpe_parallel_opts=(--num-jobs-nnet 8 --num-threads 1) + dnn_gpu_mpe_parallel_opts=(--num-jobs-nnet 8 --num-threads 1) + dnn_parallel_opts="-l gpu=1" +else + decode_extra_opts=(--num-threads 6 --parallel-opts "-pe smp 6 -l mem_free=4G,ram_free=0.7G") + sgmm_train_extra_opts=( --num-iters 25 ) + sgmm_group_extra_opts=(--group 3 --parallel-opts "-pe smp 3 -l mem_free=7G,ram_free=2.75G" --cmd "queue.pl -l arch=*64 -l mem_free=3.0G,ram_free=3.0G") + sgmm_denlats_extra_opts=(--num-threads 4 --parallel-opts "-pe smp 4" --cmd "queue.pl -l arch=*64 -l mem_free=2G,ram_free=0.8G") + sgmm_mmi_extra_opts=(--cmd "queue.pl -l arch=*64 -l mem_free=3.2G,ram_free=3.2G") + dnn_denlats_extra_opts=(--num-threads 4 --parallel-opts "-pe smp 4" --cmd "queue.pl -l arch=*64 -l mem_free=2G,ram_free=0.8G") + + dnn_cpu_parallel_opts=(--minibatch-size 128 --num-jobs-nnet 8 --num-threads 16 \ + --parallel-opts "-pe smp 16" --cmd "queue.pl -l arch=*64 -l mem_free=2G,ram_free=1G") + dnn_gpu_parallel_opts=(--minibatch-size 512 --num-jobs-nnet 8 --num-threads 1 \ + --parallel-opts "-l gpu=1" --cmd "queue.pl -l arch=*64 -l mem_free=2G,ram_free=1G") + dnn_parallel_opts="-l gpu=1" + dnn_gpu_mpe_parallel_opts=(--num-jobs-nnet 8 --num-threads 1 \ + --parallel-opts "-l gpu=1" --cmd "queue.pl -l arch=*64 -l mem_free=2G,ram_free=1G") +fi + +icu_transform="Any-Lower" +case_insensitive=true + + +max_states=150000 +wip=0.5 + + +phoneme_mapping= + +minimize=true + +proxy_phone_beam=-1 +proxy_phone_nbest=-1 +proxy_beam=5 +proxy_nbest=500 + +extlex_proxy_phone_beam=5 +extlex_proxy_phone_nbest=300 +extlex_proxy_beam=-1 
+extlex_proxy_nbest=-1 diff --git a/egs/babel_multilang/s5/conf/common.limitedLP b/egs/babel_multilang/s5/conf/common.limitedLP new file mode 100644 index 00000000000..49b8fc6ab7c --- /dev/null +++ b/egs/babel_multilang/s5/conf/common.limitedLP @@ -0,0 +1,104 @@ +# BNF training parameters +bnf_num_hidden_layers=5 +bottleneck_dim=42 +bnf_hidden_layer_dim=1024 +bnf_minibatch_size=512 +bnf_init_learning_rate=0.008 +bnf_final_learning_rate=0.0008 +bnf_max_change=40 +bnf_num_jobs=4 +bnf_num_threads=1 +bnf_mixup=5000 +bnf_mpe_learning_rate=0.00009 +bnf_mpe_last_layer_factor=0.1 +bnf_num_gauss_ubm=500 # use fewer UBM Gaussians than the + # non-bottleneck system (which has 750) +bnf_num_gauss_sgmm=10000 # use fewer SGMM sub-states than the + # non-bottleneck system (which has 18000). +bnf_decode_acwt=0.066666 + + +## DNN hybrid system training parameters +dnn_num_hidden_layers=3 +dnn_input_dim=2000 +dnn_output_dim=200 +dnn_init_learning_rate=0.008 +dnn_final_learning_rate=0.0008 +dnn_mixup=5000 + +dnn_mpe_learning_rate=0.00009 +dnn_mpe_last_layer_factor=0.1 +dnn_mpe_retroactive=true + +bnf_every_nth_frame=1 # take all frames. +babel_type=limited + +use_pitch=true + +lmwt_plp_extra_opts=( --min-lmwt 8 --max-lmwt 12 ) +lmwt_bnf_extra_opts=( --min-lmwt 15 --max-lmwt 22 ) +lmwt_dnn_extra_opts=( --min-lmwt 10 --max-lmwt 15 ) + +dnn_beam=16.0 +dnn_lat_beam=8.5 + +icu_opt=(--use-icu true --icu-transform Any-Lower) + +# Semi-supervised examples options +dnn_update_egs_opts=(--weight-threshold 0.7 --splice-width 4 --samples-per-iter 200000 --num-jobs-nnet 4 --io-opts "-tc 5" ) + +if [[ `hostname` == *.tacc.utexas.edu ]] ; then + decode_extra_opts=( --num-threads 4 --parallel-opts "-pe smp 4" ) + sgmm_train_extra_opts=( --num-iters 25 ) + sgmm_group_extra_opts=( ) + sgmm_denlats_extra_opts=( --num-threads 1 ) + dnn_denlats_extra_opts=( --num-threads 1 ) + + dnn_cpu_parallel_opts=(--minibatch-size 128 --num-jobs-nnet 8 --num-threads 16 \ + --parallel-opts "-pe smp 16" ) + dnn_gpu_parallel_opts=(--minibatch-size 512 --num-jobs-nnet 4 --num-threads 1 + --parallel-opts "-pe smp 16" ) + + dnn_gpu_mpe_parallel_opts=(--num-jobs-nnet 4 --num-threads 1) + + dnn_update_parallel_opts=( --num-epochs 15 --num-epochs-extra 5 --num-iters-final 20 ) +else + decode_extra_opts=(--num-threads 6 --parallel-opts "-pe smp 6 -l mem_free=4G,ram_free=4.0G") + sgmm_train_extra_opts=( --num-iters 25 ) + sgmm_group_extra_opts=(--group 3 --parallel-opts "-pe smp 3 -l mem_free=7G,ram_free=7.0G" --cmd "queue.pl -l arch=*64 -l mem_free=2.0G,ram_free=2.0G") + sgmm_denlats_extra_opts=(--num-threads 4 --parallel-opts "-pe smp 4" --cmd "queue.pl -l arch=*64 -l mem_free=2G,ram_free=2.0G") + sgmm_mmi_extra_opts=(--cmd "queue.pl -l arch=*64 -l mem_free=1.5G,ram_free=1.5G") + dnn_denlats_extra_opts=(--num-threads 4 --parallel-opts "-pe smp 4" --cmd "queue.pl -l arch=*64 -l mem_free=2G,ram_free=2.0G") + + dnn_cpu_parallel_opts=(--minibatch-size 128 --num-jobs-nnet 8 --num-threads 16 \ + --parallel-opts "-pe smp 16" --cmd "queue.pl -l arch=*64 -l mem_free=2G,ram_free=2G") + dnn_gpu_parallel_opts=(--minibatch-size 512 --num-jobs-nnet 4 --num-threads 1 \ + --parallel-opts "-l gpu=1" --cmd "queue.pl -l arch=*64 -l mem_free=2G,ram_free=2G") + dnn_parallel_opts="-l gpu=1" + dnn_gpu_mpe_parallel_opts=(--num-jobs-nnet 4 --num-threads 1 \ + --parallel-opts "-l gpu=1" --cmd "queue.pl -l arch=*64 -l mem_free=2G,ram_free=2G") + + dnn_update_parallel_opts=( --num-epochs 15 --num-epochs-extra 5 --num-iters-final 20 ) +fi + +icu_transform="Any-Lower" 
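+# 'Any-Lower' is an ICU transliteration rule; together with the case_insensitive
+# setting below it is presumably what makes keyword/lexicon matching case-insensitive.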
+case_insensitive=true + + +max_states=150000 +wip=0.5 + + +phoneme_mapping= + +minimize=true + +proxy_phone_beam=-1 +proxy_phone_nbest=-1 +proxy_beam=5 +proxy_nbest=500 + +extlex_proxy_phone_beam=5 +extlex_proxy_phone_nbest=300 +extlex_proxy_beam=-1 +extlex_proxy_nbest=-1 diff --git a/egs/babel_multilang/s5/conf/common.semisupervised.limitedLP b/egs/babel_multilang/s5/conf/common.semisupervised.limitedLP new file mode 120000 index 00000000000..85955be6954 --- /dev/null +++ b/egs/babel_multilang/s5/conf/common.semisupervised.limitedLP @@ -0,0 +1 @@ +../../../babel/s5c/conf/common.semisupervised.limitedLP \ No newline at end of file diff --git a/egs/babel_multilang/s5/conf/common_vars.sh b/egs/babel_multilang/s5/conf/common_vars.sh new file mode 100644 index 00000000000..4c285f60ce5 --- /dev/null +++ b/egs/babel_multilang/s5/conf/common_vars.sh @@ -0,0 +1,21 @@ +#keyword search default +glmFile=conf/glm +duptime=0.5 +case_insensitive=false +use_pitch=true +# Lexicon and Language Model parameters +oovSymbol="" +lexiconFlags="-oov " +boost_sil=1.5 # note from Dan: I expect 1.0 might be better (equivalent to not + # having the option)... should test. +cer=0 + +#Declaring here to make the definition inside the language conf files more +# transparent and nice +declare -A dev10h_more_kwlists +declare -A dev2h_more_kwlists +declare -A eval_more_kwlists +declare -A shadow_more_kwlists + +[ -f ./path.sh ] && . ./path.sh; # source the path. +[ -f ./cmd.sh ] && . ./cmd.sh; # source train and decode cmds. diff --git a/egs/babel_multilang/s5/conf/decode.config b/egs/babel_multilang/s5/conf/decode.config new file mode 100644 index 00000000000..e69de29bb2d diff --git a/egs/babel_multilang/s5/conf/decode_dnn.config b/egs/babel_multilang/s5/conf/decode_dnn.config new file mode 100644 index 00000000000..e69de29bb2d diff --git a/egs/babel_multilang/s5/conf/glm b/egs/babel_multilang/s5/conf/glm new file mode 120000 index 00000000000..54a69f7d856 --- /dev/null +++ b/egs/babel_multilang/s5/conf/glm @@ -0,0 +1 @@ +../../../babel/s5c/conf/glm \ No newline at end of file diff --git a/egs/babel_multilang/s5/conf/lang b/egs/babel_multilang/s5/conf/lang new file mode 120000 index 00000000000..efc3224fa69 --- /dev/null +++ b/egs/babel_multilang/s5/conf/lang @@ -0,0 +1 @@ +../../../babel/s5c/conf/lang \ No newline at end of file diff --git a/egs/babel_multilang/s5/conf/mfcc.conf b/egs/babel_multilang/s5/conf/mfcc.conf new file mode 100644 index 00000000000..af5f9c081bc --- /dev/null +++ b/egs/babel_multilang/s5/conf/mfcc.conf @@ -0,0 +1,10 @@ +# config for high-resolution MFCC features, intended for neural network training. +# Note: we keep all cepstra, so it has the same info as filterbank features, +# but MFCC is more easily compressible (because less correlated) which is why +# we prefer this method. +--use-energy=false # use average of log energy, not energy. +--sample-frequency=8000 # Switchboard is sampled at 8kHz +--low-freq=40 # low cutoff frequency for mel bins +--high-freq=-200 # high cutoff frequently, relative to Nyquist of 4000 (=3800) + + diff --git a/egs/babel_multilang/s5/conf/mfcc_hires.conf b/egs/babel_multilang/s5/conf/mfcc_hires.conf new file mode 100644 index 00000000000..e7888d44a0b --- /dev/null +++ b/egs/babel_multilang/s5/conf/mfcc_hires.conf @@ -0,0 +1,11 @@ +# config for high-resolution MFCC features, intended for neural network training. 
+# Note: we keep all cepstra, so it has the same info as filterbank features,
+# but MFCC is more easily compressible (because less correlated) which is why
+# we prefer this method.
+--use-energy=false # use average of log energy, not energy.
+--sample-frequency=8000 # BABEL data is sampled at 8kHz
+--num-mel-bins=40 # similar to Google's setup.
+--num-ceps=40 # there is no dimensionality reduction.
+--low-freq=40 # low cutoff frequency for mel bins
+--high-freq=-200 # high cutoff frequency, relative to Nyquist of 4000 (=3800)
+
diff --git a/egs/babel_multilang/s5/conf/online_cmvn.conf b/egs/babel_multilang/s5/conf/online_cmvn.conf
new file mode 100644
index 00000000000..7748a4a4dd3
--- /dev/null
+++ b/egs/babel_multilang/s5/conf/online_cmvn.conf
@@ -0,0 +1 @@
+# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh
diff --git a/egs/babel_multilang/s5/conf/pitch.conf b/egs/babel_multilang/s5/conf/pitch.conf
new file mode 100644
index 00000000000..926bcfca92a
--- /dev/null
+++ b/egs/babel_multilang/s5/conf/pitch.conf
@@ -0,0 +1 @@
+--sample-frequency=8000
diff --git a/egs/babel_multilang/s5/conf/plp.conf b/egs/babel_multilang/s5/conf/plp.conf
new file mode 100644
index 00000000000..926bcfca92a
--- /dev/null
+++ b/egs/babel_multilang/s5/conf/plp.conf
@@ -0,0 +1 @@
+--sample-frequency=8000
diff --git a/egs/babel_multilang/s5/conf/queue.conf b/egs/babel_multilang/s5/conf/queue.conf
new file mode 100644
index 00000000000..2b2c354d5e2
--- /dev/null
+++ b/egs/babel_multilang/s5/conf/queue.conf
@@ -0,0 +1,10 @@
+# configuration for the AWS cluster for WS'15.
+command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64*
+option mem=* -l mem_free=$0,ram_free=$0
+option mem=0 # Do not add anything to qsub_opts
+option num_threads=* -pe smp $0
+option num_threads=1 # Do not add anything to qsub_opts
+option max_jobs_run=* -tc $0
+default gpu=0
+option gpu=0
+option gpu=1 -q g.q@b* -l gpu=1
diff --git a/egs/babel_multilang/s5/local/nnet3/extract_ivector_lang.sh b/egs/babel_multilang/s5/local/nnet3/extract_ivector_lang.sh
new file mode 100755
index 00000000000..be6a8c700f3
--- /dev/null
+++ b/egs/babel_multilang/s5/local/nnet3/extract_ivector_lang.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+# This script extracts iVectors using the global iVector extractor
+# trained on all languages in the multilingual setup.
+
+. ./cmd.sh
+set -e
+stage=1
+train_set=train
+global_extractor=exp/multi/nnet3/extractor
+ivector_suffix=_gb
+
+[ ! -f ./conf/common_vars.sh ] && echo 'the file conf/common_vars.sh does not exist!' && exit 1
+
+. conf/common_vars.sh || exit 1;
+
+[ -f local.conf ] && . ./local.conf
+
+. ./utils/parse_options.sh
+
+lang=$1
+
+mkdir -p nnet3
+
+if [ $stage -le 8 ]; then
+  # We extract iVectors on all the training data, which will be what we
+  # train the system on.
+
+  # having a larger number of speakers is helpful for generalization, and to
+  # handle per-utterance decoding well (iVector starts at zero).
+  steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/$lang/${train_set}_hires data/$lang/${train_set}_max2_hires
+
+  if [ !
-f exp/$lang/nnet3/ivectors_${train_set}${ivector_suffix}/ivector_online.scp ]; then + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 200 \ + data/$lang/${train_set}_max2_hires $global_extractor exp/$lang/nnet3/ivectors_${train_set}${ivector_suffix} || exit 1; + fi + +fi + + +exit 0; diff --git a/egs/babel_multilang/s5/local/nnet3/prepare_multilingual_egs.sh b/egs/babel_multilang/s5/local/nnet3/prepare_multilingual_egs.sh new file mode 100755 index 00000000000..d53faecee6a --- /dev/null +++ b/egs/babel_multilang/s5/local/nnet3/prepare_multilingual_egs.sh @@ -0,0 +1,106 @@ +#!/bin/bash +# +# This script generates separate egs directory for each input +# language in multilingual setup, which contains both egs.*.ark and egs.*.scp. +# +# This script will generally be called from nnet training script. + +echo "$0 $@" # Print the command line for logging +. ./cmd.sh +set -e + + +# Begin configuration section +cmd= +stage=0 +left_context=13 +right_context=9 +online_multi_ivector_dirs= # list of iVector dir for all languages + # can be used if we are including speaker information as iVectors. + # e.g. "exp/lang1/train-ivector exp/lang2/train-ivector" +samples_per_iter=400000 # this is the target number of egs in each archive of egs + # (prior to merging egs). We probably should have called + # it egs_per_iter. This is just a guideline; it will pick + # a number that divides the number of samples in the + # entire data. +# Configuration to allocate egs +minibatch_size=512 +num_archives=100 +num_jobs=10 +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +if [ $# -lt 4 ]; then + echo "Usage: $0 [opts] num-input-langs " + echo " e.g.: $0 2 data/lang1/train data/lang2/train " + " exp/lang1/tri5_ali exp/lang2/tri5_ali exp/lang1/nnet3/egs exp/lang2/nnet3/egs exp/multi/egs" + echo "" + echo "Main options (for others, see top of script file)" + echo " --config # config file containing options" + echo " --num-jobs # The maximum number of jobs you want to run in" + echo " # parallel (increase this only if you have good disk and" + echo " # network speed). default=6" + echo " --cmd (utils/run.pl;utils/queue.pl ) # how to run jobs." + echo " --samples-per-iter <#samples;400000> # Target number of egs per archive (option is badly named)" + echo " --frames-per-eg # number of frames per eg on disk" + echo " --left-context # Number of frames on left side to append for feature input" + echo " --right-context # Number of frames on right side to append for feature input" + echo " --num-frames-diagnostic <#frames;4000> # Number of frames used in computing (train,valid) diagnostics" + echo " --num-valid-frames-combine <#frames;10000> # Number of frames used in getting combination weights at the" + echo " # very end." + echo " --stage # Used to run a partially-completed training process from somewhere in" + echo " # the middle." + + exit 1; +fi + +num_lang=$1 +shift +args=("$@") + +if [ ${#args[@]} != $[$num_lang*3] ]; then + echo "$0: num of input dirs provided for all langs is not compatible with num-langs in input." && exit 1; +fi + +# read input data, ali and egs dir per lang +for l in `seq 0 $[$num_lang-1]`; do + multi_data_dirs[$l]=${args[$l]} + multi_ali_dirs[$l]=${args[$l+$num_lang]} + multi_egs_dirs[$l]=${args[$l+2*$num_lang]} +done + +echo "$0: Generate separate egs directory per language for multilingual training." 
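+# At this point the positional arguments have been split into three per-language
+# arrays. For illustration (hypothetical paths, mirroring the usage message above),
+# a two-language call lays its arguments out as: data dirs first, then alignment
+# dirs, then egs dirs:
+#   local/nnet3/prepare_multilingual_egs.sh 2 \
+#     data/lang1/train data/lang2/train \
+#     exp/lang1/tri5_ali exp/lang2/tri5_ali \
+#     exp/lang1/nnet3/egs exp/lang2/nnet3/egs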
+online_multi_ivector_dirs=(${online_multi_ivector_dirs[@]})
+for lang_index in `seq 0 $[$num_lang-1]`; do
+  data=${multi_data_dirs[$lang_index]}
+  ali_dir=${multi_ali_dirs[$lang_index]}
+  egs_dir=${multi_egs_dirs[$lang_index]}
+  online_ivector_dir=
+  if [ ! -z "${online_multi_ivector_dirs[$lang_index]}" ]; then
+    online_ivector_dir=${online_multi_ivector_dirs[$lang_index]}
+  fi
+  echo online_ivector_dir = $online_ivector_dir
+  if [ ! -d "$egs_dir" ]; then
+    echo "$0: Generate egs for ${lang_list[$lang_index]}"
+    if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $egs_dir/storage ]; then
+      utils/create_split_dir.pl \
+        /export/b0{3,4,5,6}/$USER/kaldi-data/egs/${lang_list[$lang_index]}-$(date +'%m_%d_%H_%M')/s5/$egs_dir/storage $egs_dir/storage
+    fi
+
+    extra_opts=()
+    [ ! -z "$cmvn_opts" ] && extra_opts+=(--cmvn-opts "$cmvn_opts")
+    [ ! -z "$online_ivector_dir" ] && extra_opts+=(--online-ivector-dir $online_ivector_dir)
+    extra_opts+=(--left-context $left_context)
+    extra_opts+=(--right-context $right_context)
+    echo "$0: calling get_egs.sh"
+    steps/nnet3/get_egs.sh $egs_opts "${extra_opts[@]}" \
+      --samples-per-iter $samples_per_iter --stage $stage \
+      --cmd "$cmd" $egs_opts \
+      --generate-egs-scp true \
+      $data $ali_dir $egs_dir || exit 1;
+
+  fi
+done
+
diff --git a/egs/babel_multilang/s5/local/nnet3/run_common_langs.sh b/egs/babel_multilang/s5/local/nnet3/run_common_langs.sh
new file mode 100755
index 00000000000..ca9e8517b44
--- /dev/null
+++ b/egs/babel_multilang/s5/local/nnet3/run_common_langs.sh
@@ -0,0 +1,110 @@
+#!/bin/bash
+# This script is used to generate MFCC+pitch features for an input language L.
+
+. ./cmd.sh
+set -e
+stage=1
+train_stage=-10
+generate_alignments=true # If true, it regenerates alignments.
+speed_perturb=true
+use_pitch=true # If true, it generates pitch features and combines them with 40dim MFCC.
+pitch_conf=conf/pitch.conf # Configuration used for pitch extraction.
+use_pitch_plp=false # If true, it generates plp+pitch features to be used when regenerating alignments.
+
+[ ! -f ./conf/common_vars.sh ] && echo 'the file conf/common_vars.sh does not exist!' && exit 1
+
+. conf/common_vars.sh || exit 1;
+
+[ -f local.conf ] && . ./local.conf
+
+. ./utils/parse_options.sh
+
+lang=$1
+
+# perturbed data preparation
+train_set=train
+if [ "$speed_perturb" == "true" ]; then
+  if [ $stage -le 1 ]; then
+    # Although the nnet will be trained on high-resolution data, we still have to
+    # perturb the normal data to get the alignments.
+    # _sp stands for speed-perturbed
+    for datadir in train; do
+      ./utils/data/perturb_data_dir_speed_3way.sh data/$lang/${datadir} data/$lang/${datadir}_sp
+
+      # Extract PLP(+pitch) features for the perturbed data.
+      featdir=plp_perturbed/$lang
+      if $use_pitch_plp; then
+        steps/make_plp_pitch.sh --cmd "$train_cmd" --nj $train_nj data/$lang/${datadir}_sp exp/$lang/make_plp_pitch/${datadir}_sp $featdir
+      else
+        steps/make_plp.sh --cmd "$train_cmd" --nj $train_nj data/$lang/${datadir}_sp exp/$lang/make_plp/${datadir}_sp $featdir
+      fi
+      steps/compute_cmvn_stats.sh data/$lang/${datadir}_sp exp/$lang/make_plp/${datadir}_sp $featdir || exit 1;
+      utils/fix_data_dir.sh data/$lang/${datadir}_sp
+    done
+  fi
+
+  train_set=train_sp
+  if [ $stage -le 2 ] && [ "$generate_alignments" == "true" ] && [ !
-f exp/$lang/tri5_ali_sp/.done ]; then + #obtain the alignment of the perturbed data + steps/align_fmllr.sh \ + --nj 70 --cmd "$train_cmd" \ + --boost-silence $boost_sil \ + data/$lang/$train_set data/$lang/lang exp/$lang/tri5 exp/$lang/tri5_ali_sp || exit 1 + touch exp/$lang/tri5_ali_sp/.done + fi +fi + +if [ $stage -le 3 ] && [ ! -f data/$lang/${train_set}_hires/.done ]; then + mfccdir=mfcc_hires/$lang + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then + date=$(date +'%m_%d_%H_%M') + utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/$lang-$date/s5c/$mfccdir/storage $mfccdir/storage + fi + + for dataset in $train_set ; do + utils/copy_data_dir.sh data/$lang/$dataset data/$lang/${dataset}_hires + + # scale the waveforms, this is useful as we don't use CMVN + data_dir=data/$lang/${dataset}_hires + + utils/data/perturb_data_dir_volume.sh $data_dir || exit 1 ; + + steps/make_mfcc.sh --nj 70 --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/$lang/${dataset}_hires exp/$lang/make_hires/$dataset $mfccdir; + + steps/compute_cmvn_stats.sh data/$lang/${dataset}_hires exp/$lang/make_hires/${dataset} $mfccdir; + + # Remove the small number of utterances that couldn't be extracted for some + # reason (e.g. too short; no such file). + utils/fix_data_dir.sh data/$lang/${dataset}_hires; + done + touch data/$lang/${train_set}_hires/.done +fi + +if [ $stage -le 4 ]; then + if [[ "$use_pitch" == "true" ]]; then + pitchdir=pitch/$lang + train_set=${train_set}_hires + for dataset in $train_set; do + if $use_pitch; then + mkdir -p $pitchdir + if [ ! -f data/$lang/${dataset}_pitch/feats.scp ]; then + echo "$0: Generating pitch features for data/$lang as use_pitch=$use_pitch" + utils/copy_data_dir.sh data/$lang/$dataset data/$lang/${dataset}_pitch + steps/make_pitch.sh --nj 70 --pitch-config $pitch_conf \ + --cmd "$train_cmd" data/$lang/${dataset}_pitch exp/$lang/make_pitch/${dataset} $pitchdir; + fi + feat_suffix=_pitch + fi + + if [ ! -f data/$lang/${dataset}_mfcc${feat_suffix}/feats.scp ]; then + steps/append_feats.sh --nj 16 --cmd "$train_cmd" data/$lang/${dataset} \ + data/$lang/${dataset}${feat_suffix} data/$lang/${dataset}_mfcc${feat_suffix} \ + exp/$lang/append_mfcc${feat_suffix}/${dataset} mfcc${feat_suffix}/$lang + + steps/compute_cmvn_stats.sh data/$lang/${dataset}_mfcc${feat_suffix} exp/$lang/make_cmvn_mfcc${feat_suffix}/${x} mfcc${feat_suffix}/$lang + fi + done + fi +fi + +exit 0; diff --git a/egs/babel_multilang/s5/local/nnet3/run_multilingual_bnf.sh b/egs/babel_multilang/s5/local/nnet3/run_multilingual_bnf.sh new file mode 100755 index 00000000000..8a896778446 --- /dev/null +++ b/egs/babel_multilang/s5/local/nnet3/run_multilingual_bnf.sh @@ -0,0 +1,118 @@ +#!/bin/bash + +# This script trains a multilingual model using 6 layer TDNN + Xent +# with 42 dim bottleneck layer in fifth layer for Georgian. +# The lang_list contains 10 closest fullLP langs to Georgian + fullLP Georgian. +# Then it extracts bottleneck features for input language "lang" and +# train SAT model using these feautures. + +# Copyright 2016 Pegah Ghahremani +# Apache 2.0 + +#This yields approx 70 hours of data +# this script generates bottleneck features from multilingual model +# trained on list of languages and dump the bnf for specific language L. +set -e #Exit on non-zero return code from any command +set -o pipefail #Exit if any of the commands in the pipeline will + #return non-zero return code +. 
conf/common_vars.sh || exit 1; + +set -u #Fail on an undefined variable +bnf_train_stage=-100 # the stage variable used in multilingual bottleneck training. +stage=1 +num_archives=20 +speed_perturb=true +multidir=exp/nnet3/multi_bnf_10_close_lang_plus_grg +global_extractor=exp/multi/nnet3/extractor +lang_list=(404-georgian 403-dholuo 402-javanese 401-mongolian 307-amharic) +use_flp=true + +. ./utils/parse_options.sh + + +lang=$1 +. local/prepare_lang_conf.sh --fullLP $use_flp $lang || exit 1; + +if $use_flp; then +. local/prepare_flp_langconf.sh $lang +else +. local/prepare_llp_langconf.sh $lang +fi + +langconf=langconf/$lang/lang.conf +[ ! -f $langconf ] && echo 'Language configuration does not exist! Use the configurations in conf/lang/* as a startup' && exit 1; +. $langconf || exit 1; + +suffix= +if $speed_perturb; then + suffix=_sp +fi + +exp_dir=exp/$lang +datadir=data/$lang/train${suffix}_hires_mfcc_pitch +appended_dir=data/$lang/train${suffix}_hires_mfcc_pitch_bnf +data_bnf_dir=data/$lang/train${suffix}_bnf +dump_bnf_dir=bnf/$lang +ivector_dir=$exp_dir/nnet3/ivectors_train${suffix}_gb +############################################################################### +# +# Training multilingual model with bottleneck layer +# +############################################################################### +mkdir -p $multidir${suffix} + +if [ ! -f $multidir${suffix}/.done ]; then + echo "$0: Train multilingual Bottleneck network using lang list = ${lang_list[@]}" + ./local/nnet3/run_tdnn_joint_babel_sp_bnf.sh --dir $multidir \ + --avg-num-archives $num_archives \ + --global-extractor $global_extractor \ + --train-stage $bnf_train_stage --stage $stage || exit 1; + + touch $multidir${suffix}/.done +else + echo "$0 Skip multilingual Bottleneck network training; you can force to run this step by deleting $multidir${suffix}/.done" +fi + +[ ! -d $dump_bnf_dir ] && mkdir -p $dump_bnf_dir +if [ ! -f $data_bnf_dir/.done ]; then + multidir=$multidir${suffix} + mkdir -p $dump_bnf_dir + # put the archives in ${dump_bnf_dir}/. + steps/nnet3/make_bottleneck_features.sh --use-gpu true --nj 70 --cmd "$train_cmd" \ + --ivector-dir $ivector_dir \ + --bnf-name Tdnn_Bottleneck_renorm \ + $datadir $data_bnf_dir \ + $multidir $dump_bnf_dir $exp_dir/make_train_bnf || exit 1; + touch $data_bnf_dir/.done +else + echo "$0 Skip Bottleneck feature extraction; You can force to run this step deleting $data_bnf_dir/.done." +fi + +if [ ! -d $appended_dir/.done ]; then + steps/append_feats.sh --cmd "$train_cmd" --nj 4 \ + $data_bnf_dir $datadir $appended_dir \ + $exp_dir/append_hires_mfcc_bnf $dump_bnf_dir || exit 1; + steps/compute_cmvn_stats.sh $appended_dir \ + $exp_dir/make_cmvn_mfcc_bnf $dump_bnf_dir || exit 1; + touch $appended_dir/.done +fi + +if [ ! $exp_dir/tri5b/.done -nt $data_bnf_dir/.done ]; then + steps/train_lda_mllt.sh --splice-opts "--left-context=1 --right-context=1" \ + --dim 60 --boost-silence $boost_sil --cmd "$train_cmd" \ + $numLeavesMLLT $numGaussMLLT $appended_dir data/$lang/lang $exp_dir/tri5_ali_sp $exp_dir/tri5b ; + touch $exp_dir/tri5b/.done +fi + +if [ ! 
$exp_dir/tri6/.done -nt $exp_dir/tri5b/.done ]; then
+  steps/train_sat.sh --boost-silence $boost_sil --cmd "$train_cmd" \
+    $numLeavesSAT $numGaussSAT $appended_dir data/$lang/lang \
+    $exp_dir/tri5b $exp_dir/tri6
+  touch $exp_dir/tri6/.done
+fi
+
+echo ---------------------------------------------------------------------
+echo "$0: next, run run-6-bnf-sgmm-semisupervised.sh"
+echo ---------------------------------------------------------------------
+
+exit 0;
diff --git a/egs/babel_multilang/s5/local/nnet3/run_tdnn_joint_babel_sp_bnf.sh b/egs/babel_multilang/s5/local/nnet3/run_tdnn_joint_babel_sp_bnf.sh
new file mode 100755
index 00000000000..a645b4c2193
--- /dev/null
+++ b/egs/babel_multilang/s5/local/nnet3/run_tdnn_joint_babel_sp_bnf.sh
@@ -0,0 +1,235 @@
+#!/bin/bash
+
+# This script can be used to train a multilingual setup using different
+# languages (specifically BABEL languages) with no shared phones.
+# It will generate a separate egs directory for each dataset and combine them
+# during training.
+# In the new multilingual training setup, mini-batches of data corresponding to
+# different languages are randomly sampled during training based on a probability
+# distribution that reflects the relative frequency of the data from each language.
+
+# For all languages, we share all the hidden layers and there is a separate final
+# layer per language.
+# A bottleneck layer can be added to the network structure.
+
+# The script requires you to have baseline PLP features for all languages.
+# It generates 40dim MFCC + pitch features for all languages.
+
+# The global iVector extractor is trained using all languages, and iVectors are
+# extracted for all languages.
+
+echo "$0 $@" # Print the command line for logging
+. ./cmd.sh
+set -e
+
+
+cmd=queue.pl
+stage=0
+train_stage=-10
+get_egs_stage=-10
+decode_stage=-10
+num_jobs_initial=2
+num_jobs_final=8
+speed_perturb=true
+use_pitch=true
+global_extractor=exp/multi/nnet3/extractor
+alidir=tri5_ali
+suffix=
+use_ivector=true
+feat_suffix=_hires_mfcc # The feature suffix describing features used in multilingual training
+                        # _hires_mfcc -> 40dim MFCC
+                        # _hires_mfcc_pitch -> 40dim MFCC + pitch
+                        # _hires_mfcc_pitch_bnf -> 40dim MFCC + pitch + BNF
+# corpora
+# language list used for multilingual training
+# The mapping from lang-name to its abbreviation can be found in
+# local/prepare_lang_conf.sh
+# e.g. lang_list=(101-cantonese 102-assamese 103-bengali)
+lang_list=
+# The languages in this list are decoded using the hybrid multilingual system.
+# e.g. decode_lang_list=(101-cantonese)
+decode_lang_list=
+
+dir=exp/nnet3/multi_bnf
+splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -7,2 0 0"
+
+ivector_suffix=_gb # if ivector_suffix = _gb, the iVectors are extracted using the global iVector
+                   # extractor trained on pooled data from all languages.
+                   # Otherwise, it uses iVectors extracted using the local (per-language) iVector extractor.
+
+. ./path.sh
+. ./cmd.sh
+. ./utils/parse_options.sh
+
+[ -f local.conf ] && . ./local.conf
+
+num_langs=${#lang_list[@]}
+
+echo "$0 $@" # Print the command line for logging
+if !
cuda-compiled; then + cat </dev/null | grep num-pdfs | awk '{print $2}'` || exit 1; + num_multiple_leaves="$num_multiple_leaves $num_leaves" + multi_data_dirs[$lang_index]=data/${lang_list[$lang_index]}/train${suffix}${feat_suffix} + multi_egs_dirs[$lang_index]=exp/${lang_list[$lang_index]}/nnet3/egs${ivector_suffix} + multi_ali_dirs[$lang_index]=exp/${lang_list[$lang_index]}/tri5_ali${suffix} + multi_ivector_dirs[$lang_index]=exp/${lang_list[$lang_index]}/nnet3/ivectors_train${suffix}${ivector_suffix} +done + +if $use_ivector; then + ivector_dim=$(feat-to-dim scp:${multi_ivector_dirs[0]}/ivector_online.scp -) || exit 1; + echo ivector-dim = $ivector_dim +else + echo "$0: Not using iVectors in multilingual training." + ivector_dim=0 +fi + +feat_dim=`feat-to-dim scp:${multi_data_dirs[0]}/feats.scp -` + + +if [ $stage -le 9 ]; then + mkdir -p $dir/log + echo "$0: creating neural net config for multilingual setups" + # create the config files for nnet initialization + $cmd $dir/log/make_config.log \ + python steps/nnet3/tdnn/make_configs.py \ + --splice-indexes "$splice_indexes" \ + --feat-dim $feat_dim \ + --ivector-dim $ivector_dim \ + --relu-dim 600 \ + --num-multiple-targets "$num_multiple_leaves" \ + --bottleneck-dim 42 --bottleneck-layer 5 \ + --use-presoftmax-prior-scale false \ + --add-lda false \ + $dir/configs || exit 1; + # Initialize as "raw" nnet, prior to training the LDA-like preconditioning + # matrix. This first config just does any initial splicing that we do; + # we do this as it's a convenient way to get the stats for the 'lda-like' + # transform. + $cmd $dir/log/nnet_init.log \ + nnet3-init --srand=-2 $dir/configs/init.config $dir/init.raw || exit 1; +fi + +if [ $stage -le 10 ]; then + echo "$0: Generate separate egs dir per language for multilingual training." + # sourcing the "vars" below sets + #model_left_context=(something) + #model_right_context=(something) + #num_hidden_layers=(something) + . $dir/configs/vars || exit 1; + + + ivec="${multi_ivector_dirs[@]}" + if $use_ivector; then + ivector_opts=(--online-multi-ivector-dirs "$ivec") + fi + local/nnet3/prepare_multilingual_egs.sh --cmd "$decode_cmd" \ + "${ivector_opts[@]}" \ + --left-context $model_left_context --right-context $model_right_context \ + --samples-per-iter 400000 \ + $num_langs ${multi_data_dirs[@]} ${multi_ali_dirs[@]} ${multi_egs_dirs[@]} || exit 1; +fi + +if [ $stage -le 11 ]; then + echo "$0: training mutilingual model." + common_egs_dir="${multi_egs_dirs[@]} $dir/egs" + echo common_egs_dir = $common_egs_dir + steps/nnet3/train_raw_dnn.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --use-dense-target false \ + --trainer.num-epochs 2 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0017 \ + --trainer.optimization.final-effective-lrate 0.00017 \ + --feat-dir ${multi_data_dirs[0]} \ + --feat.online-ivector-dir ${multi_ivector_dirs[0]} \ + --egs.dir "${common_egs_dir[@]}" \ + --cleanup.remove-egs false \ + --cleanup.preserve-model-interval 20 \ + --use-gpu true \ + --reporting.email="$reporting_email" \ + --dir=$dir || exit 1; +fi + +# decoding different languages +if [ $stage -le 12 ]; then + num_decode_lang=${#decode_lang_list[@]} + ( + for lang in `seq 0 $[$num_decode_lang-1]`; do + if [ ! -f $dir/${decode_lang_list[$lang]}/decode_dev10h.pem/.done ]; then + cp $dir/cmvn_opts $dir/${decode_lang_list[$lang]}/. 
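+      # (assumption: the per-language decode step looks for cmvn_opts inside the
+      # language sub-directory of $dir, so it is copied there to keep that
+      # directory self-contained)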
+ echo "Decoding lang ${decode_lang_list[$lang]} using multilingual hybrid model $dir" + run-4-anydecode-langs.sh --use-ivector $use_ivector --nnet3-dir $dir ${decode_lang_list[$lang]} || exit 1; + touch $dir/${decode_lang_list[$lang]}/decode_dev10h.pem/.done + fi + done + wait + ) +fi diff --git a/egs/babel_multilang/s5/local/prepare_lang_conf.sh b/egs/babel_multilang/s5/local/prepare_lang_conf.sh new file mode 100755 index 00000000000..18eceaa9403 --- /dev/null +++ b/egs/babel_multilang/s5/local/prepare_lang_conf.sh @@ -0,0 +1,106 @@ +#!/bin/bash +# This script maps lang-name to its config w.r.t fullLP or limitedLP condition. + +fullLP=true +. ./utils/parse_options.sh + +if [ $# -ne 1 ]; then + echo "Usage: $(basename $0) " + echo " e.g.: $(basename $0) ASM" + exit 1 +fi + +L=$1 +echo L = $L and fullLP = $fullLP +if $fullLP; then + lang_type=-fullLP + lang_type2=.FLP +else + lang_type=-limitedLP + lang_type2=.LLP +fi + +case "$L" in + 101-cantonese) + langconf=conf/lang/101-cantonese${lang_type}.official.conf + ;; + 102-assamese) + langconf=conf/lang/102-assamese${lang_type}.official.conf + ;; + 103-bengali) + langconf=conf/lang/103-bengali${lang_type}.official.conf + ;; + 104-pashto) + langconf=conf/lang/104-pashto${lang_type}.official.conf + ;; + 105-turkish) + langconf=conf/lang/105-turkish${lang_type}.official.conf + ;; + 106-tagalog) + langconf=conf/lang/106-tagalog${lang_type}.official.conf + ;; + 107-vietnamese) + langconf=conf/lang/107-vietnamese${lang_type}.official.conf + ;; + 201-haitian) + langconf=conf/lang/201-haitian${lang_type}.official.conf + ;; + 202-swahili) + langconf=conf/lang/202-swahili${lang_type}.official.conf + ;; + 203-lao) + langconf=conf/lang/203-lao${lang_type}.official.conf + ;; + 204-tamil) + langconf=conf/lang/204-tamil${lang_type}.official.conf + ;; + 205-kurmanji) + langconf=conf/lang/205-kurmanji${lang_type2}.official.conf + ;; + 206-zulu) + langconf=conf/lang/206-zulu-${lang_type}.official.conf + ;; + 207-tokpisin) + langconf=conf/lang/207-tokpisin${lang_type2}.official.conf + ;; + 301-cebuano) + langconf=conf/lang/301-cebuano${lang_type2}.official.conf + ;; + 302-kazakh) + langconf=conf/lang/302-kazakh${lang_type2}.official.conf + ;; + 303-telugu) + langconf=conf/lang/303-telugu${lang_type2}.official.conf + ;; + 304-lithuanian) + langconf=conf/lang/304-lithuanian${lang_type2}.official.conf + ;; + 305-guarani) + langconf=conf/lang/305-guarani${lang_type2}.official.conf + ;; + 306-igbo) + langconf=conf/lang/306-igbo${lang_type2}.official.conf + ;; + 307-amharic) + langconf=conf/lang/307-amharic${lang_type2}.official.conf + ;; + 401-mongolian) + langconf=conf/lang/401-mongolian${lang_type2}.official.conf + ;; + 402-javanese) + langconf=conf/lang/402-javanese${lang_type2}.official.conf + ;; + 403-dholuo) + langconf=conf/lang/403-dholuo${lang_type2}.official.conf + ;; + 404-georgian) + langconf=conf/lang/404-georgian.FLP.official.conf + ;; + *) + echo "Unknown language code $L." 
&& exit 1 +esac + +mkdir -p langconf/$L +rm -rf langconf/$L/* +cp $langconf langconf/$L/lang.conf + diff --git a/egs/babel_multilang/s5/run-4-anydecode-langs.sh b/egs/babel_multilang/s5/run-4-anydecode-langs.sh new file mode 100755 index 00000000000..3e13e5eb3a6 Binary files /dev/null and b/egs/babel_multilang/s5/run-4-anydecode-langs.sh differ diff --git a/egs/babel_multilang/s5/utils b/egs/babel_multilang/s5/utils new file mode 120000 index 00000000000..b240885218f --- /dev/null +++ b/egs/babel_multilang/s5/utils @@ -0,0 +1 @@ +../../wsj/s5/utils \ No newline at end of file diff --git a/egs/wsj/s5/steps/nnet3/components.py b/egs/wsj/s5/steps/nnet3/components.py index 9b9ce4a54ad..0b85012e7d0 100644 --- a/egs/wsj/s5/steps/nnet3/components.py +++ b/egs/wsj/s5/steps/nnet3/components.py @@ -26,12 +26,16 @@ def GetSumDescriptor(inputs): return sum_descriptors # adds the input nodes and returns the descriptor -def AddInputLayer(config_lines, feat_dim, splice_indexes=[0], ivector_dim=0): +def AddInputLayer(config_lines, feat_dim, splice_indexes=[0], ivector_dim=0, idct_mat = None): components = config_lines['components'] component_nodes = config_lines['component-nodes'] output_dim = 0 components.append('input-node name=input dim=' + str(feat_dim)) - list = [('Offset(input, {0})'.format(n) if n != 0 else 'input') for n in splice_indexes] + prev_layer_output = {'descriptor': "input", + 'dimension': feat_dim} + if idct_mat is not None: + prev_layer_output = AddFixedAffineLayer(config_lines, "Idct", prev_layer_output, idct_mat) + list = [('Offset({0}, {1})'.format(prev_layer_output['descriptor'],n) if n != 0 else prev_layer_output['descriptor']) for n in splice_indexes] output_dim += len(splice_indexes) * feat_dim if ivector_dim > 0: components.append('input-node name=ivector dim=' + str(ivector_dim)) @@ -158,6 +162,11 @@ def AddConvolutionLayer(config_lines, name, input, else: conv_init_string += " num-filters={0}".format(num_filters) + if param_stddev is not None: + conv_init_string += " param-stddev={0}".format(param_stddev) + if bias_stddev is not None: + conv_init_string += " bias-stddev={0}".format(bias_stddev) + components.append(conv_init_string) component_nodes.append("component-node name={0}_conv_t component={0}_conv input={1}".format(name, input['descriptor'])) @@ -448,4 +457,4 @@ def AddBLstmLayer(config_lines, 'descriptor': output_descriptor, 'dimension':output_dim } - + diff --git a/egs/wsj/s5/steps/nnet3/get_egs.sh b/egs/wsj/s5/steps/nnet3/get_egs.sh index 79bfc25fff6..b00f0d10102 100755 --- a/egs/wsj/s5/steps/nnet3/get_egs.sh +++ b/egs/wsj/s5/steps/nnet3/get_egs.sh @@ -39,7 +39,7 @@ num_utts_subset=300 # number of utterances in validation and training num_valid_frames_combine=0 # #valid frames for combination weights at the very end. num_train_frames_combine=10000 # # train frames for the above. num_frames_diagnostic=4000 # number of frames for "compute_prob" jobs -samples_per_iter=400000 # this is the target number of egs in each archive of egs +samples_per_iter=40000 # this is the target number of egs in each archive of egs # (prior to merging egs). We probably should have called # it egs_per_iter. This is just a guideline; it will pick # a number that divides the number of samples in the @@ -56,6 +56,7 @@ online_ivector_dir= # can be used if we are including speaker information as iV cmvn_opts= # can be used for specifying CMVN options, if feature type is not lda (if lda, # it doesn't make sense to use different options than were used as input to the # LDA transform). 
This is used to turn off CMVN in the online-nnet experiments. +generate_egs_scp=false # If true, it will generate egs.JOB.*.scp per egs archive echo "$0 $@" # Print the command line for logging @@ -294,23 +295,37 @@ if [ $stage -le 3 ]; then wait; [ -f $dir/.error ] && echo "Error detected while creating train/valid egs" && exit 1 echo "... Getting subsets of validation examples for diagnostics and combination." + if $generate_egs_scp; then + valid_combine_output="ark,scp:$dir/valid_combine.egs,$dir/valid_combine.egs.scp" + valid_diagnostic_output="ark,scp:$dir/valid_diagnostic.egs,$dir/valid_diagnostic.egs.scp" + train_combine_output="ark,scp:$dir/train_combine.egs,$dir/train_combine.egs.scp" + train_diagnostic_output="ark,scp:$dir/train_diagnostic.egs,$dir/train_diagnostic.egs.scp" + else + valid_combine_output="ark:$dir/valid_combine.egs" + valid_diagnostic_output="ark:$dir/valid_diagnostic.egs" + train_combine_output="ark:$dir/train_combine.egs" + train_diagnostic_output="ark:$dir/train_diagnostic.egs" + fi $cmd $dir/log/create_valid_subset_combine.log \ nnet3-subset-egs --n=$num_valid_frames_combine ark:$dir/valid_all.egs \ - ark:$dir/valid_combine.egs || touch $dir/.error & + $valid_combine_output || touch $dir/.error & $cmd $dir/log/create_valid_subset_diagnostic.log \ nnet3-subset-egs --n=$num_frames_diagnostic ark:$dir/valid_all.egs \ - ark:$dir/valid_diagnostic.egs || touch $dir/.error & + $valid_diagnostic_output || touch $dir/.error & $cmd $dir/log/create_train_subset_combine.log \ nnet3-subset-egs --n=$num_train_frames_combine ark:$dir/train_subset_all.egs \ - ark:$dir/train_combine.egs || touch $dir/.error & + $train_combine_output || touch $dir/.error & $cmd $dir/log/create_train_subset_diagnostic.log \ nnet3-subset-egs --n=$num_frames_diagnostic ark:$dir/train_subset_all.egs \ - ark:$dir/train_diagnostic.egs || touch $dir/.error & + $train_diagnostic_output || touch $dir/.error & wait sleep 5 # wait for file system to sync. cat $dir/valid_combine.egs $dir/train_combine.egs > $dir/combine.egs - + if $generate_egs_scp; then + cat $dir/valid_combine.egs.scp $dir/train_combine.egs.scp > $dir/combine.egs.scp + rm $dir/{train,valid}_combine.egs.scp + fi for f in $dir/{combine,train_diagnostic,valid_diagnostic}.egs; do [ ! -s $f ] && echo "No examples in file $f" && exit 1; done @@ -345,15 +360,32 @@ if [ $stage -le 5 ]; then done if [ $archives_multiple == 1 ]; then # normal case. + if $generate_egs_scp; then + output_archive="ark,scp:$dir/egs.JOB.ark,$dir/egs.JOB.scp" + else + output_archive="ark:$dir/egs.JOB.ark" + fi $cmd --max-jobs-run $nj JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \ - nnet3-shuffle-egs --srand=\$[JOB+$srand] "ark:cat $egs_list|" ark:$dir/egs.JOB.ark || exit 1; + nnet3-shuffle-egs --srand=\$[JOB+$srand] "ark:cat $egs_list|" $output_archive || exit 1; + + if $generate_egs_scp; then + #concatenate egs.JOB.scp in single egs.scp + rm -rf $dir/egs.scp + for j in $(seq $num_archives_intermediate); do + cat $dir/egs.$j.scp || exit 1; + done > $dir/egs.scp || exit 1; + fi else # we need to shuffle the 'intermediate archives' and then split into the # final archives. we create soft links to manage this splitting, because # otherwise managing the output names is quite difficult (and we don't want # to submit separate queue jobs for each intermediate archive, because then # the --max-jobs-run option is hard to enforce). 
- output_archives="$(for y in $(seq $archives_multiple); do echo ark:$dir/egs.JOB.$y.ark; done)" + if $generate_egs_scp; then + output_archives="$(for y in $(seq $archives_multiple); do echo ark,scp:$dir/egs.JOB.$y.ark,$dir/egs.JOB.$y.scp; done)" + else + output_archives="$(for y in $(seq $archives_multiple); do echo ark:$dir/egs.JOB.$y.ark; done)" + fi for x in $(seq $num_archives_intermediate); do for y in $(seq $archives_multiple); do archive_index=$[($x-1)*$archives_multiple+$y] @@ -364,8 +396,17 @@ if [ $stage -le 5 ]; then $cmd --max-jobs-run $nj JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \ nnet3-shuffle-egs --srand=\$[JOB+$srand] "ark:cat $egs_list|" ark:- \| \ nnet3-copy-egs ark:- $output_archives || exit 1; - fi + if $generate_egs_scp; then + #concatenate egs.JOB.scp in single egs.scp + rm -rf $dir/egs.scp + for j in $(seq $num_archives_intermediate); do + for y in $(seq $num_archives_intermediate); do + cat $dir/egs.$j.$y.scp || exit 1; + done + done > $dir/egs.scp || exit 1; + fi + fi fi if [ $stage -le 6 ]; then diff --git a/egs/wsj/s5/steps/nnet3/libs/rnn_train_lib.py b/egs/wsj/s5/steps/nnet3/libs/rnn_train_lib.py new file mode 100644 index 00000000000..3c77f0ae00e --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/libs/rnn_train_lib.py @@ -0,0 +1,210 @@ +#!/usr/bin/env python + +# Copyright 2016 Vijayaditya Peddinti. +# 2016 Vimal Manohar +# Apache 2.0. + +# This is a module with methods which will be used by scripts for training of +# recurrent neural network acoustic model and raw model (i.e., generic neural +# network without transition model) with frame-level objectives. + +import logging +import imp + +nnet3_train_lib = imp.load_source('ntl', 'steps/nnet3/nnet3_train_lib.py') + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) +handler = logging.StreamHandler() +handler.setLevel(logging.INFO) +formatter = logging.Formatter('%(asctime)s [%(filename)s:%(lineno)s - %(funcName)s - %(levelname)s ] %(message)s') +handler.setFormatter(formatter) +logger.addHandler(handler) + +# this is the main method which differs between RNN and DNN training +def TrainNewModels(dir, iter, srand, num_jobs, + num_archives_processed, num_archives, + raw_model_string, egs_dir, + left_context, right_context, min_deriv_time, + momentum, max_param_change, + shuffle_buffer_size, num_chunk_per_minibatch, + cache_read_opt, run_opts): + # We cannot easily use a single parallel SGE job to do the main training, + # because the computation of which archive and which --frame option + # to use for each job is a little complex, so we spawn each one separately. + # this is no longer true for RNNs as we use do not use the --frame option + # but we use the same script for consistency with FF-DNN code + + context_opts="--left-context={0} --right-context={1}".format( + left_context, right_context) + processes = [] + for job in range(1,num_jobs+1): + k = num_archives_processed + job - 1 # k is a zero-based index that we will derive + # the other indexes from. + archive_index = (k % num_archives) + 1 # work out the 1-based archive index. + + cache_write_opt = "" + if job == 1: + # an option for writing cache (storing pairs of nnet-computations and + # computation-requests) during training. 
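+            # Note: only the first job writes the cache; the compiled computations
+            # should be identical across jobs, so a single copy is enough for the
+            # next iteration to read back (see the --read-cache option set in
+            # TrainOneIteration below).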
+ cache_write_opt="--write-cache={dir}/cache.{iter}".format(dir=dir, iter=iter+1) + + process_handle = nnet3_train_lib.RunKaldiCommand(""" +{command} {train_queue_opt} {dir}/log/train.{iter}.{job}.log \ + nnet3-train {parallel_train_opts} {cache_read_opt} {cache_write_opt} \ + --print-interval=10 --momentum={momentum} \ + --max-param-change={max_param_change} \ + --optimization.min-deriv-time={min_deriv_time} "{raw_model}" \ + "ark,bg:nnet3-copy-egs {context_opts} ark:{egs_dir}/egs.{archive_index}.ark ark:- | nnet3-shuffle-egs --buffer-size={shuffle_buffer_size} --srand={srand} ark:- ark:-| nnet3-merge-egs --minibatch-size={num_chunk_per_minibatch} --measure-output-frames=false --discard-partial-minibatches=true ark:- ark:- |" \ + {dir}/{next_iter}.{job}.raw + """.format(command = run_opts.command, + train_queue_opt = run_opts.train_queue_opt, + dir = dir, iter = iter, srand = iter + srand, next_iter = iter + 1, job = job, + parallel_train_opts = run_opts.parallel_train_opts, + cache_read_opt = cache_read_opt, cache_write_opt = cache_write_opt, + momentum = momentum, max_param_change = max_param_change, + min_deriv_time = min_deriv_time, + raw_model = raw_model_string, context_opts = context_opts, + egs_dir = egs_dir, archive_index = archive_index, + shuffle_buffer_size = shuffle_buffer_size, + num_chunk_per_minibatch = num_chunk_per_minibatch), + wait = False) + + processes.append(process_handle) + + all_success = True + for process in processes: + process.wait() + [stdout_value, stderr_value] = process.communicate() + print(stderr_value) + if process.returncode != 0: + all_success = False + + if not all_success: + open('{0}/.error'.format(dir), 'w').close() + raise Exception("There was error during training iteration {0}".format(iter)) + +def TrainOneIteration(dir, iter, srand, egs_dir, + num_jobs, num_archives_processed, num_archives, + learning_rate, shrinkage_value, num_chunk_per_minibatch, + num_hidden_layers, add_layers_period, + left_context, right_context, min_deriv_time, + momentum, max_param_change, shuffle_buffer_size, + cv_minibatch_size, run_opts, + compute_accuracy = True, get_raw_nnet_from_am = True): + + + # Set off jobs doing some diagnostics, in the background. + # Use the egs dir from the previous iteration for the diagnostics + logger.info("Training neural net (pass {0})".format(iter)) + + # check if different iterations use the same random seed + if os.path.exists('{0}/srand'.format(dir)): + try: + saved_srand = int(open('{0}/srand'.format(dir), 'r').readline().strip()) + except IOError, ValueError: + raise Exception('Exception while reading the random seed for training') + if srand != saved_srand: + logger.warning("The random seed provided to this iteration (srand={0}) is different from the one saved last time (srand={1}). Using srand={0}.".format(srand, saved_srand)) + else: + f = open('{0}/srand'.format(dir), 'w') + f.write(str(srand)) + f.close() + + nnet3_train_lib.ComputeTrainCvProbabilities(dir, iter, egs_dir, run_opts, + mb_size=cv_minibatch_size, + get_raw_nnet_from_am = get_raw_nnet_from_am, + compute_accuracy = compute_accuracy) + + if iter > 0: + nnet3_train_lib.ComputeProgress(dir, iter, egs_dir, run_opts, + mb_size=cv_minibatch_size, + get_raw_nnet_from_am = get_raw_nnet_from_am) + + # an option for writing cache (storing pairs of nnet-computations + # and computation-requests) during training. 
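+    # (the cache is only re-used, via --read-cache, when the network structure is
+    # unchanged; on iterations where a new hidden layer has just been added the
+    # cached computations would no longer match, so cache_read_opt is left empty
+    # in that case)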
+ cache_read_opt = "" + if iter > 0 and (iter <= (num_hidden_layers-1) * add_layers_period) and (iter % add_layers_period == 0): + + do_average = False # if we've just added new hiden layer, don't do + # averaging but take the best. + cur_num_hidden_layers = 1 + iter / add_layers_period + config_file = "{0}/configs/layer{1}.config".format(dir, cur_num_hidden_layers) + if get_raw_nnet_from_am: + raw_model_string = "nnet3-am-copy --raw=true --learning-rate={lr} {dir}/{iter}.mdl - | nnet3-init --srand={srand} - {config} - |".format(lr=learning_rate, dir=dir, iter=iter, srand=iter + srand, config=config_file) + else: + raw_model_string = "nnet3-copy --learning-rate={lr} {dir}/{iter}.raw - | nnet3-init --srand={srand} - {config} - |".format(lr=learning_rate, dir=dir, iter=iter, srand=iter + srand, config=config_file) + else: + do_average = True + if iter == 0: + do_average = False # on iteration 0, pick the best, don't average. + else: + cache_read_opt = "--read-cache={dir}/cache.{iter}".format(dir=dir, iter=iter) + if get_raw_nnet_from_am: + raw_model_string = "nnet3-am-copy --raw=true --learning-rate={0} {1}/{2}.mdl - |".format(learning_rate, dir, iter) + else: + raw_model_string = "nnet3-copy --learning-rate={lr} {dir}/{iter}.raw - |".format(lr = learning_rate, dir = dir, iter = iter) + + if do_average: + cur_num_chunk_per_minibatch = num_chunk_per_minibatch + else: + # on iteration zero or when we just added a layer, use a smaller minibatch + # size (and we will later choose the output of just one of the jobs): the + # model-averaging isn't always helpful when the model is changing too fast + # (i.e. it can worsen the objective function), and the smaller minibatch + # size will help to keep the update stable. + cur_num_chunk_per_minibatch = num_chunk_per_minibatch / 2 + + try: + os.remove("{0}/.error".format(dir)) + except OSError: + pass + + TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archives, + raw_model_string, egs_dir, + left_context, right_context, min_deriv_time, + momentum, max_param_change, + shuffle_buffer_size, cur_num_chunk_per_minibatch, + cache_read_opt, run_opts) + [models_to_average, best_model] = nnet3_train_lib.GetSuccessfulModels(num_jobs, '{0}/log/train.{1}.%.log'.format(dir,iter)) + nnets_list = [] + for n in models_to_average: + nnets_list.append("{0}/{1}.{2}.raw".format(dir, iter + 1, n)) + + if do_average: + # average the output of the different jobs. + nnet3_train_lib.GetAverageNnetModel( + dir = dir, iter = iter, + nnets_list = " ".join(nnets_list), + run_opts = run_opts, + get_raw_nnet_from_am = get_raw_nnet_from_am, + shrink = shrinkage_value) + + else: + # choose the best model from different jobs + nnet3_train_lib.GetBestNnetModel( + dir = dir, iter = iter, + best_model_index = best_model, + run_opts = run_opts, + get_raw_nnet_from_am = get_raw_nnet_from_am, + shrink = shrinkage_value) + + try: + for i in range(1, num_jobs + 1): + os.remove("{0}/{1}.{2}.raw".format(dir, iter + 1, i)) + except OSError: + raise Exception("Error while trying to delete the raw models") + + if get_raw_nnet_from_am: + new_model = "{0}/{1}.mdl".format(dir, iter + 1) + else: + new_model = "{0}/{1}.raw".format(dir, iter + 1) + + if not os.path.isfile(new_model): + raise Exception("Could not find {0}, at the end of iteration {1}".format(new_model, iter)) + elif os.stat(new_model).st_size == 0: + raise Exception("{0} has size 0. 
Something went wrong in iteration {1}".format(new_model, iter)) + if cache_read_opt and os.path.exists("{0}/cache.{1}".format(dir, iter)): + os.remove("{0}/cache.{1}".format(dir, iter)) + + diff --git a/egs/wsj/s5/steps/nnet3/libs/train_lib.py b/egs/wsj/s5/steps/nnet3/libs/train_lib.py new file mode 100644 index 00000000000..f1ad2b797e2 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/libs/train_lib.py @@ -0,0 +1,357 @@ +#!/usr/bin/env python + +# Copyright 2016 Vijayaditya Peddinti. +# 2016 Vimal Manohar +# Apache 2.0. + +# This is a module with methods which will be used by scripts for training of +# deep neural network acoustic model and raw model (i.e., generic neural +# network without transition model) with frame-level objectives. + +import logging +import math +import imp +import os +nnet3_train_lib = imp.load_source('ntl', 'steps/nnet3/nnet3_train_lib.py') + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) +handler = logging.StreamHandler() +handler.setLevel(logging.INFO) +formatter = logging.Formatter('%(asctime)s [%(filename)s:%(lineno)s - %(funcName)s - %(levelname)s ] %(message)s') +handler.setFormatter(formatter) +logger.addHandler(handler) + +def AddCommonTrainArgs(parser): + # feat options + parser.add_argument("--feat.online-ivector-dir", type=str, dest='online_ivector_dir', + default = None, action = nnet3_train_lib.NullstrToNoneAction, + help="""directory with the ivectors extracted in + an online fashion.""") + parser.add_argument("--feat.cmvn-opts", type=str, dest='cmvn_opts', + default = None, action = nnet3_train_lib.NullstrToNoneAction, + help="A string specifying '--norm-means' and '--norm-vars' values") + + # egs extraction options + parser.add_argument("--egs.transform_dir", type=str, dest='transform_dir', + default = None, action = nnet3_train_lib.NullstrToNoneAction, + help="""String to provide options directly to steps/nnet3/get_egs.sh script""") + parser.add_argument("--egs.dir", type=str, dest='egs_dir', + default = None, action = nnet3_train_lib.NullstrToNoneAction, + help="""Directory with egs. If specified this directory + will be used rather than extracting egs""") + parser.add_argument("--egs.stage", type=int, dest='egs_stage', + default = 0, help="Stage at which get_egs.sh should be restarted") + parser.add_argument("--egs.opts", type=str, dest='egs_opts', + default = None, action = nnet3_train_lib.NullstrToNoneAction, + help="""String to provide options directly to steps/nnet3/get_egs.sh script""") + + # trainer options + parser.add_argument("--trainer.srand", type=int, dest='srand', + default = 0, + help="Sets the random seed for model initialization and egs shuffling. " + "Warning: This random seed does not control all aspects of this experiment. " + "There might be other random seeds used in other stages of the experiment " + "like data preparation (e.g. volume perturbation).") + parser.add_argument("--trainer.num-epochs", type=int, dest='num_epochs', + default = 8, + help="Number of epochs to train the model") + parser.add_argument("--trainer.prior-subset-size", type=int, dest='prior_subset_size', + default = 20000, + help="Number of samples for computing priors") + parser.add_argument("--trainer.num-jobs-compute-prior", type=int, dest='num_jobs_compute_prior', + default = 10, + help="The prior computation jobs are single threaded and run on the CPU") + parser.add_argument("--trainer.max-models-combine", type=int, dest='max_models_combine', + default = 20, + help="The maximum number of models used in the final model combination stage. 
These models will themselves be averages of iteration-number ranges") + parser.add_argument("--trainer.shuffle-buffer-size", type=int, dest='shuffle_buffer_size', + default = 5000, + help=""" Controls randomization of the samples on each + iteration. If 0 or a large value the randomization is + complete, but this will consume memory and cause spikes + in disk I/O. Smaller is easier on disk and memory but + less random. It's not a huge deal though, as samples + are anyway randomized right at the start. + (the point of this is to get data in different + minibatches on different iterations, since in the + preconditioning method, 2 samples in the same minibatch + can affect each others' gradients.""") + parser.add_argument("--trainer.add-layers-period", type=int, dest='add_layers_period', + default=2, + help="The number of iterations between adding layers" + "during layer-wise discriminative training.") + parser.add_argument("--trainer.max-param-change", type=float, dest='max_param_change', + default=2.0, + help="""The maximum change in parameters allowed + per minibatch, measured in Frobenius norm over + the entire model""") + parser.add_argument("--trainer.samples-per-iter", type=int, dest='samples_per_iter', + default=400000, + help="This is really the number of egs in each archive.") + parser.add_argument("--trainer.lda.rand-prune", type=float, dest='rand_prune', + default=4.0, + help="""Value used in preconditioning matrix estimation""") + parser.add_argument("--trainer.lda.max-lda-jobs", type=float, dest='max_lda_jobs', + default=10, + help="""Max number of jobs used for LDA stats accumulation""") + + # Parameters for the optimization + parser.add_argument("--trainer.optimization.initial-effective-lrate", type=float, dest='initial_effective_lrate', + default = 0.0003, + help="Learning rate used during the initial iteration") + parser.add_argument("--trainer.optimization.final-effective-lrate", type=float, dest='final_effective_lrate', + default = 0.00003, + help="Learning rate used during the final iteration") + parser.add_argument("--trainer.optimization.num-jobs-initial", type=int, dest='num_jobs_initial', + default = 1, + help="Number of neural net jobs to run in parallel at the start of training") + parser.add_argument("--trainer.optimization.num-jobs-final", type=int, dest='num_jobs_final', + default = 8, + help="Number of neural net jobs to run in parallel at the end of training") + parser.add_argument("--trainer.optimization.max-models-combine", type=int, dest='max_models_combine', + default = 20, + help = """ The is the maximum number of models we give to the + final 'combine' stage, but these models will themselves + be averages of iteration-number ranges. """) + parser.add_argument("--trainer.optimization.momentum", type=float, dest='momentum', + default = 0.0, + help="""Momentum used in update computation. + Note: we implemented it in such a way that + it doesn't increase the effective learning rate.""") + # General options + parser.add_argument("--stage", type=int, default=-4, + help="Specifies the stage of the experiment to execution from") + parser.add_argument("--exit-stage", type=int, default=None, + help="If specified, training exits before running this stage") + parser.add_argument("--cmd", type=str, action = nnet3_train_lib.NullstrToNoneAction, + dest = "command", + help="""Specifies the script to launch jobs. + e.g. 
queue.pl for launching on SGE cluster + run.pl for launching on local machine + """, default = "queue.pl") + parser.add_argument("--egs.cmd", type=str, action = nnet3_train_lib.NullstrToNoneAction, + dest = "egs_command", + help="""Script to launch egs jobs""", default = "queue.pl") + parser.add_argument("--use-gpu", type=str, action = nnet3_train_lib.StrToBoolAction, + choices = ["true", "false"], + help="Use GPU for training", default=True) + parser.add_argument("--cleanup", type=str, action = nnet3_train_lib.StrToBoolAction, + choices = ["true", "false"], + help="Clean up models after training", default=True) + parser.add_argument("--cleanup.remove-egs", type=str, dest='remove_egs', + default = True, action = nnet3_train_lib.StrToBoolAction, + choices = ["true", "false"], + help="""If true, remove egs after experiment""") + parser.add_argument("--cleanup.preserve-model-interval", dest = "preserve_model_interval", + type=int, default=100, + help="Determines iterations for which models will be preserved during cleanup. If iter MOD preserve_model_interval == 0 model will be preserved.") + + parser.add_argument("--reporting.email", dest = "email", + type=str, default=None, action = nnet3_train_lib.NullstrToNoneAction, + help=""" Email-id to report about the progress of the experiment. + NOTE: It assumes the machine on which the script is being run can send + emails from command line via. mail program. The + Kaldi mailing list will not support this feature. + It might require local expertise to setup. """) + parser.add_argument("--reporting.interval", dest = "reporting_interval", + type=int, default=0.1, + help="Frequency with which reports have to be sent, measured in terms of fraction of iterations. If 0 and reporting mail has been specified then only failure notifications are sent") + +# a class to store run options +class RunOpts: + def __init__(self): + self.command = None + self.train_queue_opt = None + self.combine_queue_opt = None + self.prior_gpu_opt = None + self.prior_queue_opt = None + self.parallel_train_opts = None + +# this is the main method which differs between RNN and DNN training +def TrainNewModels(dir, iter, srand, num_jobs, + num_archives_processed, num_archives, + raw_model_string, egs_dir, frames_per_eg, + left_context, right_context, + momentum, max_param_change, + shuffle_buffer_size, minibatch_size, + cache_read_opt, run_opts): + # We cannot easily use a single parallel SGE job to do the main training, + # because the computation of which archive and which --frame option + # to use for each job is a little complex, so we spawn each one separately. + # this is no longer true for RNNs as we use do not use the --frame option + # but we use the same script for consistency with FF-DNN code + + context_opts="--left-context={0} --right-context={1}".format( + left_context, right_context) + processes = [] + for job in range(1,num_jobs+1): + k = num_archives_processed + job - 1 # k is a zero-based index that we will derive + # the other indexes from. + archive_index = (k % num_archives) + 1 # work out the 1-based archive index. + frame = (k / num_archives) % frames_per_eg + + cache_write_opt = "" + if job == 1: + # an option for writing cache (storing pairs of nnet-computations and + # computation-requests) during training. 
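+            # Only the first job writes the cache; on later iterations
+            # TrainOneIteration passes --read-cache=<dir>/cache.<iter> so that
+            # every job can reuse the compiled computations instead of recompiling.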
+ cache_write_opt="--write-cache={dir}/cache.{iter}".format(dir=dir, iter=iter+1) + + egs_for_train_string = nnet3_train_lib.ExampleString(egs_dir, minibatch_size, + context_opts = context_opts, archive_index = archive_index, + iter = iter, shuffle_buffer_size = shuffle_buffer_size, + frame = frame) + + process_handle = nnet3_train_lib.RunKaldiCommand(""" +{command} {train_queue_opt} {dir}/log/train.{iter}.{job}.log \ + nnet3-train {parallel_train_opts} {cache_read_opt} {cache_write_opt} \ + --print-interval=20 --momentum={momentum} \ + --max-param-change={max_param_change} \ + "{raw_model}" \ + "{egs_for_train}" \ + {dir}/{next_iter}.{job}.raw + """.format(command = run_opts.command, + train_queue_opt = run_opts.train_queue_opt, + dir = dir, iter = iter, srand = iter + srand, next_iter = iter + 1, job = job, + parallel_train_opts = run_opts.parallel_train_opts, + cache_read_opt = cache_read_opt, cache_write_opt = cache_write_opt, + momentum = momentum, max_param_change = max_param_change, + raw_model = raw_model_string, + egs_for_train = egs_for_train_string), + wait = False) + + processes.append(process_handle) + + all_success = True + for process in processes: + process.wait() + [stdout_value, stderr_value] = process.communicate() + print(stderr_value) + if process.returncode != 0: + all_success = False + + if not all_success: + open('{0}/.error'.format(dir), 'w').close() + raise Exception("There was error during training iteration {0}".format(iter)) + +def TrainOneIteration(dir, iter, srand, egs_dir, + num_jobs, num_archives_processed, num_archives, + learning_rate, minibatch_size, + frames_per_eg, num_hidden_layers, add_layers_period, + left_context, right_context, + momentum, max_param_change, shuffle_buffer_size, + run_opts, + compute_accuracy = True, get_raw_nnet_from_am = True): + + + # Set off jobs doing some diagnostics, in the background. + # Use the egs dir from the previous iteration for the diagnostics + logger.info("Training neural net (pass {0})".format(iter)) + + # check if different iterations use the same random seed + if os.path.exists('{0}/srand'.format(dir)): + try: + saved_srand = int(open('{0}/srand'.format(dir), 'r').readline().strip()) + except IOError, ValueError: + raise Exception('Exception while reading the random seed for training') + if srand != saved_srand: + logger.warning("The random seed provided to this iteration (srand={0}) is different from the one saved last time (srand={1}). Using srand={0}.".format(srand, saved_srand)) + else: + f = open('{0}/srand'.format(dir), 'w') + f.write(str(srand)) + f.close() + + nnet3_train_lib.ComputeTrainCvProbabilities( + dir, iter, egs_dir, run_opts, + get_raw_nnet_from_am = get_raw_nnet_from_am, + compute_accuracy = compute_accuracy) + + if iter > 0: + nnet3_train_lib.ComputeProgress( + dir, iter, egs_dir, run_opts, + get_raw_nnet_from_am = get_raw_nnet_from_am) + + # an option for writing cache (storing pairs of nnet-computations + # and computation-requests) during training. + cache_read_opt = "" + if iter > 0 and (iter <= (num_hidden_layers-1) * add_layers_period) and (iter % add_layers_period == 0): + + do_average = False # if we've just added new hiden layer, don't do + # averaging but take the best. 
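+        # Worked example: with add_layers_period=2 this branch fires on iterations
+        # 2, 4, ..., (num_hidden_layers-1)*2; e.g. on iter=4 we initialize from
+        # configs/layer3.config (cur_num_hidden_layers = 1 + 4/2 = 3).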
+ cur_num_hidden_layers = 1 + iter / add_layers_period + config_file = "{0}/configs/layer{1}.config".format(dir, cur_num_hidden_layers) + if get_raw_nnet_from_am: + raw_model_string = "nnet3-am-copy --raw=true --learning-rate={lr} {dir}/{iter}.mdl - | nnet3-init --srand={srand} - {config} - |".format(lr=learning_rate, dir=dir, iter=iter, srand=iter + srand, config=config_file) + else: + raw_model_string = "nnet3-copy --learning-rate={lr} {dir}/{iter}.raw - | nnet3-init --srand={srand} - {config} - |".format(lr=learning_rate, dir=dir, iter=iter, srand=iter + srand, config=config_file) + else: + do_average = True + if iter == 0: + do_average = False # on iteration 0, pick the best, don't average. + else: + cache_read_opt = "--read-cache={dir}/cache.{iter}".format(dir=dir, iter=iter) + if get_raw_nnet_from_am: + raw_model_string = "nnet3-am-copy --raw=true --learning-rate={0} {1}/{2}.mdl - |".format(learning_rate, dir, iter) + else: + raw_model_string = "nnet3-copy --learning-rate={lr} {dir}/{iter}.raw - |".format(lr = learning_rate, dir = dir, iter = iter) + + if do_average: + cur_minibatch_size = minibatch_size + cur_max_param_change = max_param_change + else: + # on iteration zero or when we just added a layer, use a smaller minibatch + # size (and we will later choose the output of just one of the jobs): the + # model-averaging isn't always helpful when the model is changing too fast + # (i.e. it can worsen the objective function), and the smaller minibatch + # size will help to keep the update stable. + cur_minibatch_size = minibatch_size / 2 + cur_max_param_change = float(max_param_change) / math.sqrt(2) + + try: + os.remove("{0}/.error".format(dir)) + except OSError: + pass + + TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archives, + raw_model_string, egs_dir, frames_per_eg, + left_context, right_context, + momentum, max_param_change, + shuffle_buffer_size, cur_minibatch_size, + cache_read_opt, run_opts) + [models_to_average, best_model] = nnet3_train_lib.GetSuccessfulModels(num_jobs, '{0}/log/train.{1}.%.log'.format(dir,iter)) + nnets_list = [] + for n in models_to_average: + nnets_list.append("{0}/{1}.{2}.raw".format(dir, iter + 1, n)) + + if do_average: + # average the output of the different jobs. + nnet3_train_lib.GetAverageNnetModel( + dir = dir, iter = iter, + nnets_list = " ".join(nnets_list), + run_opts = run_opts, + get_raw_nnet_from_am = get_raw_nnet_from_am) + else: + # choose the best model from different jobs + nnet3_train_lib.GetBestNnetModel( + dir = dir, iter = iter, + best_model_index = best_model, + run_opts = run_opts, + get_raw_nnet_from_am = get_raw_nnet_from_am) + + try: + for i in range(1, num_jobs + 1): + os.remove("{0}/{1}.{2}.raw".format(dir, iter + 1, i)) + except OSError: + raise Exception("Error while trying to delete the raw models") + + if get_raw_nnet_from_am: + new_model = "{0}/{1}.mdl".format(dir, iter + 1) + else: + new_model = "{0}/{1}.raw".format(dir, iter + 1) + + if not os.path.isfile(new_model): + raise Exception("Could not find {0}, at the end of iteration {1}".format(new_model, iter)) + elif os.stat(new_model).st_size == 0: + raise Exception("{0} has size 0. 
Something went wrong in iteration {1}".format(new_model, iter)) + if cache_read_opt and os.path.exists("{0}/cache.{1}".format(dir, iter)): + os.remove("{0}/cache.{1}".format(dir, iter)) diff --git a/egs/wsj/s5/steps/nnet3/lstm/make_configs.py b/egs/wsj/s5/steps/nnet3/lstm/make_configs.py index 53739f0f9ce..996d64eef2e 100755 --- a/egs/wsj/s5/steps/nnet3/lstm/make_configs.py +++ b/egs/wsj/s5/steps/nnet3/lstm/make_configs.py @@ -50,6 +50,19 @@ def GetArgs(): default=0.0) parser.add_argument("--include-log-softmax", type=str, action=nnet3_train_lib.StrToBoolAction, help="add the final softmax layer ", default=True, choices = ["false", "true"]) + parser.add_argument("--add-lda", type=str, action=nnet3_train_lib.StrToBoolAction, + help="If \"true\" an LDA matrix computed from the input features " + "(spliced according to the first set of splice-indexes) will be used as " + "the first Affine layer. This affine layer's parameters are fixed during training. " + "This variable needs to be set to \"false\" when using dense-targets " + "or when --add-idct is set to \"true\".", + default=True, choices = ["false", "true"]) + parser.add_argument("--add-final-sigmoid", type=str, action=nnet3_train_lib.StrToBoolAction, + help="add a sigmoid layer as the final layer. Applicable only if skip-final-softmax is true.", + choices=['true', 'false'], default = False) + parser.add_argument("--objective-type", type=str, default="linear", + choices = ["linear", "quadratic"], + help = "the type of objective; i.e. quadratic or linear") # LSTM options parser.add_argument("--num-lstm-layers", type=int, @@ -86,6 +99,16 @@ def GetArgs(): parser.add_argument("--lstm-delay", type=str, default=None, help="option to have different delays in recurrence for each lstm") + # Options to convert input MFCC into Fbank features. This is useful when a + # LDA layer is not added (such as when using dense targets) + parser.add_argument("--cepstral-lifter", type=float, dest = "cepstral_lifter", + help="The factor used for determining the liftering vector in the production of MFCC. " + "User has to ensure that it matches the lifter used in MFCC generation, " + "e.g. 
22.0", default=22.0) + parser.add_argument("--add-idct", type=str, action=nnet3_train_lib.StrToBoolAction, + help="Add an IDCT after input to convert MFCC to Fbank", + default = False, choices = ["true", "false"]) + parser.add_argument("config_dir", help="Directory to write config files and variables") @@ -115,6 +138,9 @@ def CheckArgs(args): if not args.feat_dim > 0: raise Exception("feat-dim has to be postive") + if args.add_lda and args.add_idct: + raise Exception("add-idct can be true only if add-lda is false") + if not args.num_targets > 0: print(args.num_targets) raise Exception("num_targets has to be positive") @@ -208,28 +234,39 @@ def ParseLstmDelayString(lstm_delay): return lstm_delay_array -def MakeConfigs(config_dir, feat_dim, ivector_dim, num_targets, +def MakeConfigs(config_dir, feat_dim, ivector_dim, num_targets, add_lda, + add_idct, cepstral_lifter, splice_indexes, lstm_delay, cell_dim, hidden_dim, recurrent_projection_dim, non_recurrent_projection_dim, num_lstm_layers, num_hidden_layers, norm_based_clipping, clipping_threshold, ng_per_element_scale_options, ng_affine_options, - label_delay, include_log_softmax, xent_regularize, + label_delay, include_log_softmax, add_final_sigmoid, + objective_type, xent_regularize, self_repair_scale_nonlinearity, self_repair_scale_clipgradient): config_lines = {'components':[], 'component-nodes':[]} + if add_idct: + nnet3_train_lib.WriteIdctMatrix(feat_dim, cepstral_lifter, config_dir.strip() + "/idct.mat") + config_files={} - prev_layer_output = nodes.AddInputLayer(config_lines, feat_dim, splice_indexes[0], ivector_dim) + prev_layer_output = nodes.AddInputLayer(config_lines, feat_dim, splice_indexes[0], + ivector_dim, + idct_mat = config_dir.strip() + "/idct.mat" if add_idct else None) # Add the init config lines for estimating the preconditioning matrices init_config_lines = copy.deepcopy(config_lines) init_config_lines['components'].insert(0, '# Config file for initializing neural network prior to') init_config_lines['components'].insert(0, '# preconditioning matrix computation') - nodes.AddOutputLayer(init_config_lines, prev_layer_output) + nodes.AddOutputLayer(init_config_lines, prev_layer_output, label_delay = label_delay, objective_type = objective_type) config_files[config_dir + '/init.config'] = init_config_lines - prev_layer_output = nodes.AddLdaLayer(config_lines, "L0", prev_layer_output, config_dir + '/lda.mat') + # add_lda needs to be set "false" when using dense targets, + # or if the task is not a simple classification task + # (e.g. 
regression, multi-task) + if add_lda: + prev_layer_output = nodes.AddLdaLayer(config_lines, "L0", prev_layer_output, args.config_dir + '/lda.mat') for i in range(num_lstm_layers): if len(lstm_delay[i]) == 2: # add a bi-directional LSTM layer @@ -248,7 +285,7 @@ def MakeConfigs(config_dir, feat_dim, ivector_dim, num_targets, lstm_delay = lstm_delay[i][0], self_repair_scale_nonlinearity = self_repair_scale_nonlinearity, self_repair_scale_clipgradient = self_repair_scale_clipgradient) # make the intermediate config file for layerwise discriminative # training - nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, ng_affine_options, label_delay = label_delay, include_log_softmax = include_log_softmax) + nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, ng_affine_options, label_delay = label_delay, include_log_softmax = include_log_softmax, add_final_sigmoid = add_final_sigmoid, objective_type = objective_type) if xent_regularize != 0.0: @@ -265,7 +302,7 @@ def MakeConfigs(config_dir, feat_dim, ivector_dim, num_targets, ng_affine_options, self_repair_scale = self_repair_scale_nonlinearity) # make the intermediate config file for layerwise discriminative # training - nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, ng_affine_options, label_delay = label_delay, include_log_softmax = include_log_softmax) + nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, ng_affine_options, label_delay = label_delay, include_log_softmax = include_log_softmax, add_final_sigmoid = add_final_sigmoid, objective_type = objective_type) if xent_regularize != 0.0: nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, @@ -293,14 +330,6 @@ def ProcessSpliceIndexes(config_dir, splice_indexes, label_delay, num_lstm_layer if (num_hidden_layers < num_lstm_layers): raise Exception("num-lstm-layers : number of lstm layers has to be greater than number of layers, decided based on splice-indexes") - # write the files used by other scripts like steps/nnet3/get_egs.sh - f = open(config_dir + "/vars", "w") - print('model_left_context=' + str(left_context), file=f) - print('model_right_context=' + str(right_context), file=f) - print('num_hidden_layers=' + str(num_hidden_layers), file=f) - # print('initial_right_context=' + str(splice_array[0][-1]), file=f) - f.close() - return [left_context, right_context, num_hidden_layers, splice_indexes] @@ -308,9 +337,22 @@ def Main(): args = GetArgs() [left_context, right_context, num_hidden_layers, splice_indexes] = ProcessSpliceIndexes(args.config_dir, args.splice_indexes, args.label_delay, args.num_lstm_layers) + # write the files used by other scripts like steps/nnet3/get_egs.sh + f = open(args.config_dir + "/vars", "w") + print('model_left_context=' + str(left_context), file=f) + print('model_right_context=' + str(right_context), file=f) + print('num_hidden_layers=' + str(num_hidden_layers), file=f) + print('num_targets=' + str(args.num_targets), file=f) + print('objective_type=' + str(args.objective_type), file=f) + print('add_lda=' + ("true" if args.add_lda else "false"), file=f) + print('include_log_softmax=' + ("true" if args.include_log_softmax else "false"), file=f) + f.close() + MakeConfigs(config_dir = args.config_dir, feat_dim = args.feat_dim, ivector_dim = args.ivector_dim, num_targets = args.num_targets, + add_lda = args.add_lda, + add_idct = args.add_idct, cepstral_lifter = args.cepstral_lifter, splice_indexes = splice_indexes, lstm_delay = args.lstm_delay, cell_dim = args.cell_dim, hidden_dim = 
args.hidden_dim, @@ -324,6 +366,8 @@ def Main(): ng_affine_options = args.ng_affine_options, label_delay = args.label_delay, include_log_softmax = args.include_log_softmax, + add_final_sigmoid = args.add_final_sigmoid, + objective_type = args.objective_type, xent_regularize = args.xent_regularize, self_repair_scale_nonlinearity = args.self_repair_scale_nonlinearity, self_repair_scale_clipgradient = args.self_repair_scale_clipgradient) diff --git a/egs/wsj/s5/steps/nnet3/make_bottleneck_features.sh b/egs/wsj/s5/steps/nnet3/make_bottleneck_features.sh new file mode 100755 index 00000000000..7af10014f2c --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/make_bottleneck_features.sh @@ -0,0 +1,113 @@ +#!/bin/bash + +# 2016 Pegah Ghahremani +# Apache 2.0 +# This script dumps bottleneck feature for model trained using nnet3. + +# Begin configuration section. +stage=1 +nj=4 +cmd=run.pl +use_gpu=false +ivector_dir= +# End configuration options. + +echo "$0 $@" # Print the command line for logging + +[ -f path.sh ] && . ./path.sh # source the path. +. parse_options.sh || exit 1; + +if [ $# != 5 ]; then + echo "usage: steps/nnet3/dump_bottleneck_features.sh " + echo "e.g.: steps/nnet3/dump_bottleneck_features.sh data/train data/train_bnf exp/nnet3/tdnn_bnf bnf exp_bnf/dump_bnf" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --ivector-dir # directory for ivectors" + exit 1; +fi + +data=$1 +bnf_data=$2 +nnetdir=$3 +archivedir=$4 +dir=$5 + +# Assume that final.nnet is in nnetdir +cmvn_opts=`cat $nnetdir/cmvn_opts`; +bnf_nnet=$nnetdir/final.raw +node_name=Tdnn_Bottleneck_renorm +if [ ! -f $bnf_nnet ] ; then + echo "No such file $bnf_nnet"; + exit 1; +fi + +if $use_gpu; then + compute_queue_opt="--gpu 1" + compute_gpu_opt="--use-gpu=yes" + if ! cuda-compiled; then + echo "$0: WARNING: you are running with one thread but you have not compiled" + echo " for CUDA. You may be running a setup optimized for GPUs. If you have" + echo " GPUs and have nvcc installed, go to src/ and do ./configure; make" + exit 1 + fi +else + echo "$0: without using a GPU this will be very slow. nnet3 does not yet support multiple threads." 
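+  # Note: in the CPU case each nnet3-compute job runs single-threaded, so raising
+  # --nj is the main way to speed this stage up.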
+ compute_gpu_opt="--use-gpu=no" +fi + + +## Set up input features of nnet +name=`basename $data` +sdata=$data/split$nj + +mkdir -p $dir/log +mkdir -p $bnf_data +echo $nj > $nnetdir/num_jobs +splice_opts=`cat $nnetdir/splice_opts 2>/dev/null` +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; + +if [ "$ivector_dir" != "" ];then + use_ivector=true +fi + +feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |" +ivec_feats="scp:utils/filter_scp.pl $sdata/JOB/utt2spk $ivector_dir/ivector_online.scp |" + +if [ $stage -le 1 ]; then + echo "$0: Generating bottle-neck features" + echo output-node name=output input=$node_name > output.config + modified_bnf_nnet="nnet3-copy --nnet-config=output.config $bnf_nnet - |" + ivector_opts= + if $use_ivector; then + ivec_period=`grep ivector-period $ivector_dir/conf/ivector_extractor.conf | cut -d"=" -f2` + ivector_opts="--online-ivector-period=$ivec_period --online-ivectors='$ivec_feats'" + fi + $cmd $compute_queue_opt JOB=1:$nj $dir/log/make_bnf_$name.JOB.log \ + nnet3-compute $compute_gpu_opt $ivector_opts "$modified_bnf_nnet" "$feats" ark:- \| \ + copy-feats ark:- ark,scp:$archivedir/raw_bnfeat_$name.JOB.ark,$archivedir/raw_bnfeat_$name.JOB.scp || exit 1; +fi + +rm $dir/trans.ark 2>/dev/null + +N0=$(cat $data/feats.scp | wc -l) +N1=$(cat $archivedir/raw_bnfeat_$name.*.scp | wc -l) +if [[ "$N0" != "$N1" ]]; then + echo "Error happens when generating BNF for $name (Original:$N0 BNF:$N1)" + exit 1; +fi + +# Concatenate feats.scp into bnf_data +for n in $(seq $nj); do cat $archivedir/raw_bnfeat_$name.$n.scp; done > $bnf_data/feats.scp + +for f in segments spk2utt text utt2spk wav.scp char.stm glm kws reco2file_and_channel stm; do + [ -e $data/$f ] && cp -r $data/$f $bnf_data/$f +done + +echo "$0: computing CMVN stats." +steps/compute_cmvn_stats.sh $bnf_data $dir $archivedir + +echo "$0: done making BNF feats.scp." + +exit 0; diff --git a/egs/wsj/s5/steps/nnet3/multilingual/allocate_multilingual_examples.py b/egs/wsj/s5/steps/nnet3/multilingual/allocate_multilingual_examples.py new file mode 100644 index 00000000000..ca068c7e6c5 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/multilingual/allocate_multilingual_examples.py @@ -0,0 +1,263 @@ +#!/usr/bin/env python + +# This script generates egs.Archive.scp and ranges.* used for generating egs.Archive.scp +# for multilingual setup. +# Also this script generates outputs.*.scp and weight.*.scp, where each line +# corresponds to language-id and weight for the same example in egs.*.scp. +# weight.*.scp used to scale the output's posterior during training. +# ranges.*.scp is generated w.r.t frequency distribution of remaining examples +# in each language. +# +# You call this script as (e.g.) +# +# allocate_multilingual_examples.py [opts] num-of-languages example-scp-lists multilingual-egs-dir +# +# allocate_multilingual_examples.py --num-jobs 10 --samples-per-iter 10000 --minibatch-size 512 +# --lang2weight exp/multi/lang2weight 2 "exp/lang1/egs.scp exp/lang2/egs.scp" +# exp/multi/egs +# +# This script outputs specific ranges.* files to the temp directory (exp/multi/egs/temp) +# that will enable you to creat egs.*.scp files for multilingual training. +# exp/multi/egs/temp/ranges.* contains something like the following: +# e.g. +# lang1 0 0 256 +# lang2 1 256 256 +# +# where each line can be interpreted as follows: +# +# +# note that is the zero-based line number in egs.scp for +# that language. 
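+# (Concretely, each line of ranges.* written by this script has three fields:
+#  the language id, the start line in that language's egs.scp, and the number
+#  of examples to copy from there.)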
+# num-examples is multiple of actual minibatch-size. +# +# +# egs.1.scp is generated using ranges.1.scp as following: +# "num_examples" consecutive examples starting from line "local-scp-line" from +# egs.scp file for language "source-lang" is copied to egs.1.scp. +# +# + +from __future__ import print_function +import re, os, argparse, sys, math, warnings, random, io, imp +nnet3_train_lib = imp.load_source('ntl', 'steps/nnet3/nnet3_train_lib.py') + +def GetArgs(): + + parser = argparse.ArgumentParser(description="Writes ranges.*, outputs.* and weights.* files " + "in preparation for dumping egs for multilingual training.", + epilog="Called by steps/nnet3/multilingual/get_egs.sh") + parser.add_argument("--samples-per-iter", type=int, default=40000, + help="The target number of egs in each archive of egs, " + "(prior to merging egs). "); + parser.add_argument("--num-jobs", type=int, default=20, + help="This can be used for better randomness in distributing languages across archives." + ", where egs.job.archive.scp generated randomly and examples are combined " + " across all jobs as eg.archive.scp.") + parser.add_argument("--random-lang", type=str, action=nnet3_train_lib.StrToBoolAction, + help="If true, the lang-id in ranges.* selected" + " w.r.t frequency distribution of remaining examples in each language," + " otherwise it is selected sequentially.", + default=True, choices = ["false", "true"]) + parser.add_argument("--max-archives", type=int, default=1000, + help="max number of archives used to generate egs.*.scp"); + parser.add_argument("--seed", type=int, default=1, + help="Seed for random number generator") + + parser.add_argument("--minibatch-size", type=int, default=512, + help="The minibatch size used to generate scp files per job. " + "It should be multiple of actual minibatch size."); + + parser.add_argument("--prefix", type=str, default="", + help="Adds a prefix to the range files. This is used to distinguish between the train " + "and diagnostic files.") + + parser.add_argument("--lang2weight", type=str, + help="lang2weight file contains the weight per language to scale output posterior for that language.(format is: " + " )"); +# now the positional arguments + parser.add_argument("num_langs", type=int, + help="num of languages used in multilingual training setup."); + parser.add_argument("egs_scp_lists", type=str, + help="list of egs.scp files per input language." + "e.g. exp/lang1/egs/egs.scp exp/lang2/egs/egs.scp"); + + parser.add_argument("egs_dir", + help="Name of egs directory e.g. exp/multilingual_a/egs"); + + + print(' '.join(sys.argv)) + + args = parser.parse_args() + + return args + + +# Returns a random language number w.r.t +# amount of examples in each language. +# It works based on sampling from a +# discrete distribution, where it returns i +# with prob(i) as (num_egs in lang(i)/ tot_egs). +# tot_egs is sum of lang_len. +def RandomLang(lang_len, tot_egs, random_selection): + assert(tot_egs > 0) + rand_int = random.randint(0, tot_egs - 1) + count = 0 + for l in range(len(lang_len)): + if random_selection: + if rand_int > count and rand_int <= (count + lang_len[l]): + rand_lang = l + break + else: + count += lang_len[l] + else: + if (lang_len[l] > 0): + rand_lang = l + break + assert(rand_lang >= 0 and rand_lang < len(lang_len)) + return rand_lang + +# Read lang2weight file and return lang2weight array +# where lang2weight[i] is weight for language i. 
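+# A lang2weight file has one line per language, "<lang-id> <weight>", read in
+# language-id order; e.g. (hypothetical values):
+#   0 1.0
+#   1 0.5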
+def ReadLang2weight(lang2w_file): + f = open(lang2w_file, "r"); + if f is None: + raise Exception("Error opening lang2weight file " + str(lang2w_file)) + lang2w = [] + for line in f: + a = line.split() + if len(a) != 2: + raise Exception("bad line in lang2weight file " + line) + lang2w.append(int(a[1])) + f.close() + return lang2w + +# struct to keep archives correspond to each job +class ArchiveToJob(): + def __init__(self, job_id, archives_for_job): + self.job_id = job_id + self.archives = archives_for_job + +def Main(): + args = GetArgs() + random.seed(args.seed) + num_langs = args.num_langs + rand_select = args.random_lang + + # read egs.scp for input languages + scp_lists = args.egs_scp_lists.split(); + assert(len(scp_lists) == num_langs); + + scp_files = [open(scp_lists[lang], 'r') for lang in range(num_langs)] + + # computes lang2len, where lang2len[i] shows number of + # examples for language i. + lang2len = [0] * num_langs + for lang in range(num_langs): + lang2len[lang] = sum(1 for line in open(scp_lists[lang])) + print("Number of examples for language {0} is {1}".format(lang, lang2len[lang])) + + # If weights are not provided, the scaling weights + # are one. + if args.lang2weight is None: + lang2weight = [ 1.0 ] * num_langs + else: + lang2weight = ReadLang2Len(args.lang2weight) + assert(len(lang2weight) == num_langs) + + if not os.path.exists(args.egs_dir + "/temp"): + os.makedirs(args.egs_dir + "/temp") + + num_lang_file = open(args.egs_dir + "/info/" + args.prefix + "num_lang", "w"); + print("{0}".format(num_langs), file = num_lang_file) + + + # Each element of all_egs (one per num_archive * num_jobs) is + # an array of 3-tuples (lang-id, local-start-egs-line, num-egs) + all_egs = [] + lang_len = lang2len[:] + tot_num_egs = sum(lang2len[i] for i in range(len(lang2len))) # total num of egs in all languages + num_archives = max(1, min(args.max_archives, tot_num_egs / args.samples_per_iter)) + + + num_arch_file = open(args.egs_dir + "/info/" + args.prefix + "num_archives", "w"); + print("{0}".format(num_archives), file = num_arch_file) + num_arch_file.close() + + this_num_egs_per_archive = tot_num_egs / (num_archives * args.num_jobs) # num of egs per archive + for job_index in range(args.num_jobs): + for archive_index in range(num_archives): + # Temporary scp.job_index.archive_index files to store egs.scp correspond to each archive. + print("Processing archive {0} for job {1}".format(archive_index + 1, job_index + 1)) + archfile = open(args.egs_dir + "/temp/" + args.prefix + "scp." + str(job_index + 1) + "." + str(archive_index + 1), "w") + + this_egs = [] # this will be array of 2-tuples (lang-id start-frame num-frames) + + num_egs = 0 + while num_egs <= this_num_egs_per_archive: + rem_egs = sum(lang_len[i] for i in range(len(lang_len))) + if rem_egs > 0: + lang_id = RandomLang(lang_len, rem_egs, rand_select) + start_egs = lang2len[lang_id] - lang_len[lang_id] + this_egs.append((lang_id, start_egs, args.minibatch_size)) + for scpline in range(args.minibatch_size): + print("{0} {1}".format(scp_files[lang_id].readline().splitlines()[0], lang_id), file = archfile) + + lang_len[lang_id] = lang_len[lang_id] - args.minibatch_size + num_egs = num_egs + args.minibatch_size; + # If the num of remaining egs in each lang is less than minibatch_size, + # they are discarded. 
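+                    # e.g. with minibatch_size=512 and 700 egs left in the chosen
+                    # language, one more chunk of 512 is still taken above and the
+                    # remaining 188 egs are dropped here.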
+ if lang_len[lang_id] < args.minibatch_size: + lang_len[lang_id] = 0 + print("Run out of data for language {0}".format(lang_id)) + else: + print("Run out of data for all languages.") + break + all_egs.append(this_egs) + archfile.close() + + # combine examples across all jobs correspond to each archive. + for archive in range(num_archives): + print("Processing archive {0} by combining all jobs.".format(archive + 1)) + this_ranges = [] + f = open(args.egs_dir + "/temp/" + args.prefix + "ranges." + str(archive + 1), "w") + o = open(args.egs_dir + "/" + args.prefix + "output." + str(archive + 1), "w") + w = open(args.egs_dir + "/" + args.prefix + "weight." + str(archive + 1), "w") + scp_per_archive_file = open(args.egs_dir + "/" + args.prefix + "egs." + str(archive + 1), "w") + + # check files befor writing. + if f is None: + raise Exception("Error opening file " + args.egs_dir + "/temp/" + args.prefix + "ranges." + str(job + 1)) + if o is None: + raise Exception("Error opening file " + args.egs_dir + "/" + args.prefix + "output." + str(job + 1)) + if w is None: + raise Exception("Error opening file " + args.egs_dir + "/" + args.prefix + "weight." + str(job + 1)) + if scp_per_archive_file is None: + raise Exception("Error opening file " + args.egs_dir + "/" + args.prefix + "egs." + str(archive + 1), "w") + + for job in range(args.num_jobs): + # combine egs.job.archive.scp across all jobs. + scp = args.egs_dir + "/temp/" + args.prefix + "scp." + str(job + 1) + "." + str(archive + 1) + with open(scp,"r") as scpfile: + for line in scpfile: + scp_line = line.splitlines()[0].split() + print("{0} {1}".format(scp_line[0], scp_line[1]), file=scp_per_archive_file) + print("{0} output-{1}".format(scp_line[0], scp_line[2]), file=o) + print("{0} {1}".format(scp_line[0], lang2weight[int(scp_line[2])]), file=w) + os.remove(scp) + + # combine ranges.* across all jobs for archive + for (lang_id, start_eg_line, num_egs) in all_egs[num_archives * job + archive]: + this_ranges.append((lang_id, start_eg_line, num_egs)) + + # write ranges.archive + for (lang_id, start_eg_line, num_egs) in this_ranges: + print("{0} {1} {2}".format(lang_id, start_eg_line, num_egs), file=f) + + scp_per_archive_file.close() + f.close() + o.close() + w.close() + print("allocate_multilingual_examples.py finished generating " + args.prefix + "egs.*.scp and " + args.prefix + "ranges.* and " + args.prefix + "output.*" + args.prefix + "weight.* files") + +if __name__ == "__main__": + Main() diff --git a/egs/wsj/s5/steps/nnet3/multilingual/get_egs.sh b/egs/wsj/s5/steps/nnet3/multilingual/get_egs.sh new file mode 100755 index 00000000000..f97be948c1f --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/multilingual/get_egs.sh @@ -0,0 +1,125 @@ +#!/bin/bash +# +# This script uses separate input egs directory for each language as input, +# to generate egs.*.scp files in multilingual egs directory +# where the scp line points to the original archive for each egs directory. +# $megs/egs.*.scp is randomized w.r.t language id. +# +# Also this script generates egs.JOB.scp, output.JOB.scp and weight.JOB.scp, +# where output file contains language-id for each example +# and weight file contains weights for scaling output posterior +# for each example w.r.t input language. +# +# Begin configuration section. +cmd=run.pl +minibatch_size=512 # multiple of minibatch used during training. +num_jobs=10 # This can be set to max number of jobs to run in parallel; + # Helps for better randomness across languages + # per archive. 
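+                # (Internally, the allocator writes temp/scp.JOB.ARCHIVE files, one
+                # per job and archive, which are then combined across jobs for each
+                # archive.)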
+samples_per_iter=400000 # this is the target number of egs in each archive of egs + # (prior to merging egs). We probably should have called + # it egs_per_iter. This is just a guideline; it will pick + # a number that divides the number of samples in the + # entire data. +stage=0 + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +num_langs=$1 +shift 1 +args=("$@") +megs_dir=${args[-1]} # multilingual directory +mkdir -p $megs_dir +mkdir -p $megs_dir/info + +if [ ${#args[@]} != $[$num_langs+1] ]; then + echo "$0: Number of input example dirs provided is not compatible with num_langs $num_langs." + echo "Usage:$0 [opts] ... " + echo "Usage:$0 [opts] 2 exp/lang1/egs exp/lang2/egs exp/multi/egs" + exit 1; +fi + +required_files="egs.scp combine.egs.scp train_diagnostic.egs.scp valid_diagnostic.egs.scp" +train_scp_list= +train_diagnostic_scp_list= +valid_diagnostic_scp_list= +combine_scp_list= + +# copy paramters from $egs_dir[0]/info +# into multilingual dir egs_dir/info + +params_to_check="feat_dim ivector_dim left_context right_context frames_per_eg" +for param in $params_to_check; do + cat ${args[0]}/info/$param > $megs_dir/info/$param || exit 1; +done + +for lang in $(seq 0 $[$num_langs-1]);do + multi_egs_dir[$lang]=${args[$lang]} + echo "arg[$lang] = ${args[$lang]}" + for f in $required_files; do + if [ ! -f ${multi_egs_dir[$lang]}/$f ]; then + echo "$0: no such a file ${multi_egs_dir[$lang]}/$f." && exit 1; + fi + done + train_scp_list="$train_scp_list ${args[$lang]}/egs.scp" + train_diagnostic_scp_list="$train_diagnostic_scp_list ${args[$lang]}/train_diagnostic.egs.scp" + valid_diagnostic_scp_list="$valid_diagnostic_scp_list ${args[$lang]}/valid_diagnostic.egs.scp" + combine_scp_list="$combine_scp_list ${args[$lang]}/combine.egs.scp" + + # check parameter dimension to be the same in all egs dirs + for f in $params_to_check; do + f1=`cat $megs_dir/info/$param`; + f2=`cat ${multi_egs_dir[$lang]}/info/$f`; + if [ $f1 != $f1 ]; then + echo "$0: mismatch in dimension for $f parameter in ${multi_egs_dir[$lang]}." + exit 1; + fi + done +done + +if [ $stage -le 0 ]; then + echo "$0: allocating multilingual examples for training." + # Generate egs.*.scp for multilingual setup. + $cmd $megs_dir/log/allocate_multilingual_examples_train.log \ + python steps/nnet3/multilingual/allocate_multilingual_examples.py \ + --minibatch-size $minibatch_size \ + --samples-per-iter $samples_per_iter \ + $num_langs "$train_scp_list" $megs_dir || exit 1; +fi + +if [ $stage -le 1 ]; then + echo "$0: combine combine.egs.scp examples from all langs in $megs_dir/combine.egs.scp." + # Generate combine.egs.scp for multilingual setup. + $cmd $megs_dir/log/allocate_multilingual_examples_combine.log \ + python steps/nnet3/multilingual/allocate_multilingual_examples.py \ + --random-lang false \ + --max-archives 1 --num-jobs 1 \ + --minibatch-size $minibatch_size \ + --prefix "combine." \ + $num_langs "$combine_scp_list" $megs_dir || exit 1; + + echo "$0: combine train_diagnostic.egs.scp examples from all langs in $megs_dir/train_diagnostic.egs.scp." + # Generate train_diagnostic.egs.scp for multilingual setup. + $cmd $megs_dir/log/allocate_multilingual_examples_train_diagnostic.log \ + python steps/nnet3/multilingual/allocate_multilingual_examples.py \ + --random-lang false \ + --max-archives 1 --num-jobs 1 \ + --minibatch-size $minibatch_size \ + --prefix "train_diagnostic." 
\ + $num_langs "$train_diagnostic_scp_list" $megs_dir || exit 1; + + + echo "$0: combine valid_diagnostic.egs.scp examples from all langs in $megs_dir/valid_diagnostic.egs.scp." + # Generate valid_diagnostic.egs.scp for multilingual setup. + $cmd $megs_dir/log/allocate_multilingual_examples_valid_diagnostic.log \ + python steps/nnet3/multilingual/allocate_multilingual_examples.py \ + --random-lang false --max-archives 1 --num-jobs 1\ + --minibatch-size $minibatch_size \ + --prefix "valid_diagnostic." \ + $num_langs "$valid_diagnostic_scp_list" $megs_dir || exit 1; + +fi + diff --git a/egs/wsj/s5/steps/nnet3/multilingual/make_tdnn_configs.py b/egs/wsj/s5/steps/nnet3/multilingual/make_tdnn_configs.py new file mode 100755 index 00000000000..9ed80afd1eb --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/multilingual/make_tdnn_configs.py @@ -0,0 +1,555 @@ +#!/usr/bin/env python + +# we're using python 3.x style print but want it to work in python 2.x, +from __future__ import print_function +import os +import argparse +import shlex +import sys +import warnings +import copy +import imp +import ast + +nodes = imp.load_source('', 'steps/nnet3/components.py') +nnet3_train_lib = imp.load_source('ntl', 'steps/nnet3/nnet3_train_lib.py') +chain_lib = imp.load_source('ncl', 'steps/nnet3/chain/nnet3_chain_lib.py') + +def GetArgs(): + # we add compulsary arguments as named arguments for readability + parser = argparse.ArgumentParser(description="Writes config files and variables " + "for TDNNs creation and training for " + "multilingaul system with multiple output " + "and bottleneck layer", + epilog="See egs/babel_multilingual/s5/local/nnet3/run_tdnn_joint_babel_sp_bnf.sh for example.") + + # Only one of these arguments can be specified, and one of them has to + # be compulsarily specified + feat_group = parser.add_mutually_exclusive_group(required = True) + feat_group.add_argument("--feat-dim", type=int, + help="Raw feature dimension, e.g. 13") + feat_group.add_argument("--feat-dir", type=str, + help="Feature directory, from which we derive the feat-dim") + + # only one of these arguments can be specified + ivector_group = parser.add_mutually_exclusive_group(required = False) + ivector_group.add_argument("--ivector-dim", type=int, + help="iVector dimension, e.g. 100", default=0) + ivector_group.add_argument("--ivector-dir", type=str, + help="iVector dir, which will be used to derive the ivector-dim ", default=None) + + num_target_group = parser.add_mutually_exclusive_group(required = True) + num_target_group.add_argument("--num-targets", type=int, + help="number of network targets (e.g. num-pdf-ids/num-leaves)") + num_target_group.add_argument("--ali-dir", type=str, + help="alignment directory, from which we derive the num-targets") + num_target_group.add_argument("--tree-dir", type=str, + help="directory with final.mdl, from which we derive the num-targets") + num_target_group.add_argument("--num-multiple-targets", type=str, + help="space separated number of network targets for different languages(e.g. num-pdf-ids/num-leaves e.g. '1000 2000 3000')") + + # CNN options + parser.add_argument('--cnn.layer', type=str, action='append', dest = "cnn_layer", + help="CNN parameters at each CNN layer, e.g. 
--filt-x-dim=3 --filt-y-dim=8 " + "--filt-x-step=1 --filt-y-step=1 --num-filters=256 --pool-x-size=1 --pool-y-size=3 " + "--pool-z-size=1 --pool-x-step=1 --pool-y-step=3 --pool-z-step=1, " + "when CNN layers are used, no LDA will be added", default = None) + parser.add_argument("--cnn.bottleneck-dim", type=int, dest = "cnn_bottleneck_dim", + help="Output dimension of the linear layer at the CNN output " + "for dimension reduction, e.g. 256." + "The default zero means this layer is not needed.", default=0) + parser.add_argument("--cnn.cepstral-lifter", type=float, dest = "cepstral_lifter", + help="The factor used for determining the liftering vector in the production of MFCC. " + "User has to ensure that it matches the lifter used in MFCC generation, " + "e.g. 22.0", default=22.0) + + # General neural network options + parser.add_argument("--splice-indexes", type=str, required = True, + help="Splice indexes at each layer, e.g. '-3,-2,-1,0,1,2,3' " + "If CNN layers are used the first set of splice indexes will be used as input " + "to the first CNN layer and later splice indexes will be interpreted as indexes " + "for the TDNNs.") + parser.add_argument("--add-lda", type=str, action=nnet3_train_lib.StrToBoolAction, + help="If \"true\" an LDA matrix computed from the input features " + "(spliced according to the first set of splice-indexes) will be used as " + "the first Affine layer. This affine layer's parameters are fixed during training. " + "If --cnn.layer is specified this option will be forced to \"false\".", + default=False, choices = ["false", "true"]) + + parser.add_argument("--include-log-softmax", type=str, action=nnet3_train_lib.StrToBoolAction, + help="add the final softmax layer ", default=True, choices = ["false", "true"]) + parser.add_argument("--add-final-sigmoid", type=str, action=nnet3_train_lib.StrToBoolAction, + help="add a final sigmoid layer as alternate to log-softmax-layer. " + "Can only be used if include-log-softmax is false. " + "This is useful in cases where you want the output to be " + "like probabilities between 0 and 1. Typically the nnet " + "is trained with an objective such as quadratic", + default=False, choices = ["false", "true"]) + + parser.add_argument("--objective-type", type=str, + help = "the type of objective; i.e. 
quadratic or linear", + default="linear", choices = ["linear", "quadratic"]) + parser.add_argument("--xent-regularize", type=float, + help="For chain models, if nonzero, add a separate output for cross-entropy " + "regularization (with learning-rate-factor equal to the inverse of this)", + default=0.0) + parser.add_argument("--xent-separate-forward-affine", type=str, action=nnet3_train_lib.StrToBoolAction, + help="if using --xent-regularize, gives it separate last-but-one weight matrix", + default=False, choices = ["false", "true"]) + parser.add_argument("--final-layer-normalize-target", type=float, + help="RMS target for final layer (set to <1 if final layer learns too fast", + default=1.0) + parser.add_argument("--subset-dim", type=int, default=0, + help="dimension of the subset of units to be sent to the central frame") + parser.add_argument("--pnorm-input-dim", type=int, + help="input dimension to p-norm nonlinearities") + parser.add_argument("--pnorm-output-dim", type=int, + help="output dimension of p-norm nonlinearities") + parser.add_argument("--relu-dim", type=int, + help="dimension of ReLU nonlinearities") + + parser.add_argument("--self-repair-scale-nonlinearity", type=float, + help="A non-zero value activates the self-repair mechanism in the sigmoid and tanh non-linearities of the LSTM", default=None) + + parser.add_argument("--use-presoftmax-prior-scale", type=str, action=nnet3_train_lib.StrToBoolAction, + help="if true, a presoftmax-prior-scale is added", + choices=['true', 'false'], default = False) + parser.add_argument("config_dir", + help="Directory to write config files and variables") + # multilingual tdnn with bn layer config + parser.add_argument("--bottleneck-layer", type=int, + help="The layer number to add bottleneck layer," + "if < 0, means this layer is not needed in network.", + default=-1) + parser.add_argument("--bottleneck-dim", type=int, + help="The bottleneck layer dimension in TDNN network e.g. 42.", + default=40) + print(' '.join(sys.argv)) + + args = parser.parse_args() + args = CheckArgs(args) + + return args + +def CheckArgs(args): + if not os.path.exists(args.config_dir): + os.makedirs(args.config_dir) + + ## Check arguments. 
+ if args.feat_dir is not None: + args.feat_dim = nnet3_train_lib.GetFeatDim(args.feat_dir) + + if args.ali_dir is not None: + args.num_targets = nnet3_train_lib.GetNumberOfLeaves(args.ali_dir) + elif args.tree_dir is not None: + args.num_targets = chain_lib.GetNumberOfLeaves(args.tree_dir) + + if args.ivector_dir is not None: + args.ivector_dim = nnet3_train_lib.GetIvectorDim(args.ivector_dir) + + if not args.feat_dim > 0: + raise Exception("feat-dim has to be postive") + + if not args.num_targets > 0: + if args.num_multiple_targets is None: + print(args.num_targets) + raise Exception("num_targets or num_multiple_targets has to be positive") + if not args.ivector_dim >= 0: + raise Exception("ivector-dim has to be non-negative") + + if (args.subset_dim < 0): + raise Exception("--subset-dim has to be non-negative") + + if not args.relu_dim is None: + if not args.pnorm_input_dim is None or not args.pnorm_output_dim is None: + raise Exception("--relu-dim argument not compatible with " + "--pnorm-input-dim or --pnorm-output-dim options"); + args.nonlin_input_dim = args.relu_dim + args.nonlin_output_dim = args.relu_dim + args.nonlin_type = 'relu' + else: + if not args.pnorm_input_dim > 0 or not args.pnorm_output_dim > 0: + raise Exception("--relu-dim not set, so expected --pnorm-input-dim and " + "--pnorm-output-dim to be provided."); + args.nonlin_input_dim = args.pnorm_input_dim + args.nonlin_output_dim = args.pnorm_output_dim + if (args.nonlin_input_dim < args.nonlin_output_dim) or (args.nonlin_input_dim % args.nonlin_output_dim != 0): + raise Exception("Invalid --pnorm-input-dim {0} and --pnorm-output-dim {1}".format(args.nonlin_input_dim, args.nonlin_output_dim)) + args.nonlin_type = 'pnorm' + + if args.add_final_sigmoid and args.include_log_softmax: + raise Exception("--include-log-softmax and --add-final-sigmoid cannot both be true.") + + if args.xent_separate_forward_affine and args.add_final_sigmoid: + raise Exception("It does not make sense to have --add-final-sigmoid=true when xent-separate-forward-affine is true") + + if args.add_lda and args.cnn_layer is not None: + args.add_lda = False + warnings.warn("--add-lda is set to false as CNN layers are used.") + + return args + +def AddConvMaxpLayer(config_lines, name, input, args): + if '3d-dim' not in input: + raise Exception("The input to AddConvMaxpLayer() needs '3d-dim' parameters.") + + input = nodes.AddConvolutionLayer(config_lines, name, input, + input['3d-dim'][0], input['3d-dim'][1], input['3d-dim'][2], + args.filt_x_dim, args.filt_y_dim, + args.filt_x_step, args.filt_y_step, + args.num_filters, input['vectorization']) + + if args.pool_x_size > 1 or args.pool_y_size > 1 or args.pool_z_size > 1: + input = nodes.AddMaxpoolingLayer(config_lines, name, input, + input['3d-dim'][0], input['3d-dim'][1], input['3d-dim'][2], + args.pool_x_size, args.pool_y_size, args.pool_z_size, + args.pool_x_step, args.pool_y_step, args.pool_z_step) + + return input + +# The ivectors are processed through an affine layer parallel to the CNN layers, +# then concatenated with the CNN output and passed to the deeper part of the network. 
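+# In summary the front end built by AddCnnLayers is: MFCC input -> fixed IDCT
+# (back to filterbank-like features) -> splicing -> conv+maxpool layers ->
+# optional cnn-bottleneck affine layer -> Append with the processed ivector.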
+def AddCnnLayers(config_lines, cnn_layer, cnn_bottleneck_dim, cepstral_lifter, config_dir, feat_dim, splice_indexes=[0], ivector_dim=0): + cnn_args = ParseCnnString(cnn_layer) + num_cnn_layers = len(cnn_args) + # We use an Idct layer here to convert MFCC to FBANK features + nnet3_train_lib.WriteIdctMatrix(feat_dim, cepstral_lifter, config_dir.strip() + "/idct.mat") + prev_layer_output = {'descriptor': "input", + 'dimension': feat_dim} + prev_layer_output = nodes.AddFixedAffineLayer(config_lines, "Idct", prev_layer_output, config_dir.strip() + '/idct.mat') + + list = [('Offset({0}, {1})'.format(prev_layer_output['descriptor'],n) if n != 0 else prev_layer_output['descriptor']) for n in splice_indexes] + splice_descriptor = "Append({0})".format(", ".join(list)) + cnn_input_dim = len(splice_indexes) * feat_dim + prev_layer_output = {'descriptor': splice_descriptor, + 'dimension': cnn_input_dim, + '3d-dim': [len(splice_indexes), feat_dim, 1], + 'vectorization': 'yzx'} + + for cl in range(0, num_cnn_layers): + prev_layer_output = AddConvMaxpLayer(config_lines, "L{0}".format(cl), prev_layer_output, cnn_args[cl]) + + if cnn_bottleneck_dim > 0: + prev_layer_output = nodes.AddAffineLayer(config_lines, "cnn-bottleneck", prev_layer_output, cnn_bottleneck_dim, "") + + if ivector_dim > 0: + iv_layer_output = {'descriptor': 'ReplaceIndex(ivector, t, 0)', + 'dimension': ivector_dim} + iv_layer_output = nodes.AddAffineLayer(config_lines, "ivector", iv_layer_output, ivector_dim, "") + prev_layer_output['descriptor'] = 'Append({0}, {1})'.format(prev_layer_output['descriptor'], iv_layer_output['descriptor']) + prev_layer_output['dimension'] = prev_layer_output['dimension'] + iv_layer_output['dimension'] + + return prev_layer_output + +def PrintConfig(file_name, config_lines): + f = open(file_name, 'w') + f.write("\n".join(config_lines['components'])+"\n") + f.write("\n#Component nodes\n") + f.write("\n".join(config_lines['component-nodes'])) + f.close() + +def ParseCnnString(cnn_param_string_list): + cnn_parser = argparse.ArgumentParser(description="cnn argument parser") + + cnn_parser.add_argument("--filt-x-dim", required=True, type=int) + cnn_parser.add_argument("--filt-y-dim", required=True, type=int) + cnn_parser.add_argument("--filt-x-step", type=int, default = 1) + cnn_parser.add_argument("--filt-y-step", type=int, default = 1) + cnn_parser.add_argument("--num-filters", required=True, type=int) + cnn_parser.add_argument("--pool-x-size", type=int, default = 1) + cnn_parser.add_argument("--pool-y-size", type=int, default = 1) + cnn_parser.add_argument("--pool-z-size", type=int, default = 1) + cnn_parser.add_argument("--pool-x-step", type=int, default = 1) + cnn_parser.add_argument("--pool-y-step", type=int, default = 1) + cnn_parser.add_argument("--pool-z-step", type=int, default = 1) + + cnn_args = [] + for cl in range(0, len(cnn_param_string_list)): + cnn_args.append(cnn_parser.parse_args(shlex.split(cnn_param_string_list[cl]))) + + return cnn_args + +def ParseSpliceString(splice_indexes): + splice_array = [] + left_context = 0 + right_context = 0 + split1 = splice_indexes.split(); # we already checked the string is nonempty. 
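+    # e.g. splice_indexes "-2,-1,0,1,2 0 -3,3 0" yields 4 hidden layers with a
+    # total model left-context of 5 and right-context of 5 (contexts accumulate
+    # over the per-layer splice groups).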
+ if len(split1) < 1: + raise Exception("invalid splice-indexes argument, too short: " + + splice_indexes) + try: + for string in split1: + split2 = string.split(",") + if len(split2) < 1: + raise Exception("invalid splice-indexes argument, too-short element: " + + splice_indexes) + int_list = [] + for int_str in split2: + int_list.append(int(int_str)) + if not int_list == sorted(int_list): + raise Exception("elements of splice-indexes must be sorted: " + + splice_indexes) + left_context += -int_list[0] + right_context += int_list[-1] + splice_array.append(int_list) + except ValueError as e: + raise Exception("invalid splice-indexes argument " + splice_indexes + str(e)) + left_context = max(0, left_context) + right_context = max(0, right_context) + + return {'left_context':left_context, + 'right_context':right_context, + 'splice_indexes':splice_array, + 'num_hidden_layers':len(splice_array) + } + +# The function signature of MakeConfigs is changed frequently as it is intended for local use in this script. +def MakeConfigs(config_dir, splice_indexes_string, + cnn_layer, cnn_bottleneck_dim, cepstral_lifter, + feat_dim, ivector_dim, num_targets, add_lda, + nonlin_type, nonlin_input_dim, nonlin_output_dim, subset_dim, + use_presoftmax_prior_scale, + final_layer_normalize_target, + include_log_softmax, + add_final_sigmoid, + xent_regularize, + xent_separate_forward_affine, + self_repair_scale, + objective_type, + num_multiple_targets, bottleneck_layer, bottleneck_dim): + + parsed_splice_output = ParseSpliceString(splice_indexes_string.strip()) + + left_context = parsed_splice_output['left_context'] + right_context = parsed_splice_output['right_context'] + num_hidden_layers = parsed_splice_output['num_hidden_layers'] + splice_indexes = parsed_splice_output['splice_indexes'] + input_dim = len(parsed_splice_output['splice_indexes'][0]) + feat_dim + ivector_dim + + if xent_separate_forward_affine: + if splice_indexes[-1] != [0]: + raise Exception("--xent-separate-forward-affine option is supported only if the last-hidden layer has no splicing before it. Please use a splice-indexes with just 0 as the final splicing config.") + + prior_scale_file = '{0}/presoftmax_prior_scale.vec'.format(config_dir) + + config_lines = {'components':[], 'component-nodes':[]} + + config_files={} + prev_layer_output = nodes.AddInputLayer(config_lines, feat_dim, splice_indexes[0], ivector_dim) + + # Add the init config lines for estimating the preconditioning matrices + init_config_lines = copy.deepcopy(config_lines) + init_config_lines['components'].insert(0, '# Config file for initializing neural network prior to') + init_config_lines['components'].insert(0, '# preconditioning matrix computation') + if len(num_multiple_targets) > 1: + for target in range(len(num_multiple_targets)): + nodes.AddOutputLayer(init_config_lines, prev_layer_output, suffix = str(target)) + else: + nodes.AddOutputLayer(init_config_lines, prev_layer_output) + + config_files[config_dir + '/init.config'] = init_config_lines + + if cnn_layer is not None: + prev_layer_output = AddCnnLayers(config_lines, cnn_layer, cnn_bottleneck_dim, cepstral_lifter, config_dir, + feat_dim, splice_indexes[0], ivector_dim) + + if add_lda: + prev_layer_output = nodes.AddLdaLayer(config_lines, "L0", prev_layer_output, config_dir + '/lda.mat') + + left_context = 0 + right_context = 0 + # we moved the first splice layer to before the LDA.. 
+ # so the input to the first affine layer is going to [0] index + splice_indexes[0] = [0] + + for i in range(0, num_hidden_layers): + # make the intermediate config file for layerwise discriminative training + + # prepare the spliced input + if not (len(splice_indexes[i]) == 1 and splice_indexes[i][0] == 0): + try: + zero_index = splice_indexes[i].index(0) + except ValueError: + zero_index = None + # I just assume the prev_layer_output_descriptor is a simple forwarding descriptor + prev_layer_output_descriptor = prev_layer_output['descriptor'] + subset_output = prev_layer_output + if subset_dim > 0: + # if subset_dim is specified the script expects a zero in the splice indexes + assert(zero_index is not None) + subset_node_config = "dim-range-node name=Tdnn_input_{0} input-node={1} dim-offset={2} dim={3}".format(i, prev_layer_output_descriptor, 0, subset_dim) + subset_output = {'descriptor' : 'Tdnn_input_{0}'.format(i), + 'dimension' : subset_dim} + config_lines['component-nodes'].append(subset_node_config) + appended_descriptors = [] + appended_dimension = 0 + for j in range(len(splice_indexes[i])): + if j == zero_index: + appended_descriptors.append(prev_layer_output['descriptor']) + appended_dimension += prev_layer_output['dimension'] + continue + appended_descriptors.append('Offset({0}, {1})'.format(subset_output['descriptor'], splice_indexes[i][j])) + appended_dimension += subset_output['dimension'] + prev_layer_output = {'descriptor' : "Append({0})".format(" , ".join(appended_descriptors)), + 'dimension' : appended_dimension} + else: + # this is a normal affine node + pass + + if xent_separate_forward_affine and i == num_hidden_layers - 1: + if xent_regularize == 0.0: + raise Exception("xent-separate-forward-affine=True is valid only if xent-regularize is non-zero") + + if nonlin_type == "relu" : + prev_layer_output_chain = nodes.AddAffRelNormLayer(config_lines, "Tdnn_pre_final_chain", + prev_layer_output, nonlin_output_dim, + self_repair_scale = self_repair_scale, + norm_target_rms = final_layer_normalize_target) + + prev_layer_output_xent = nodes.AddAffRelNormLayer(config_lines, "Tdnn_pre_final_xent", + prev_layer_output, nonlin_output_dim, + self_repair_scale = self_repair_scale, + norm_target_rms = final_layer_normalize_target) + elif nonlin_type == "pnorm" : + prev_layer_output_chain = nodes.AddAffPnormLayer(config_lines, "Tdnn_pre_final_chain", + prev_layer_output, nonlin_input_dim, nonlin_output_dim, + norm_target_rms = final_layer_normalize_target) + + prev_layer_output_xent = nodes.AddAffPnormLayer(config_lines, "Tdnn_pre_final_xent", + prev_layer_output, nonlin_input_dim, nonlin_output_dim, + norm_target_rms = final_layer_normalize_target) + else: + raise Exception("Unknown nonlinearity type") + + nodes.AddFinalLayer(config_lines, prev_layer_output_chain, num_targets, + use_presoftmax_prior_scale = use_presoftmax_prior_scale, + prior_scale_file = prior_scale_file, + include_log_softmax = include_log_softmax) + + nodes.AddFinalLayer(config_lines, prev_layer_output_xent, num_targets, + ng_affine_options = " param-stddev=0 bias-stddev=0 learning-rate-factor={0} ".format( + 0.5 / xent_regularize), + use_presoftmax_prior_scale = use_presoftmax_prior_scale, + prior_scale_file = prior_scale_file, + include_log_softmax = True, + name_affix = 'xent') + else: + if bottleneck_layer > -1 and i+1 == bottleneck_layer: + print('bottleneck layer and its dimension are {0} and {1} respectively.'.format(bottleneck_layer, bottleneck_dim)) + if nonlin_type == "relu": + prev_layer_output = 
nodes.AddAffRelNormLayer(config_lines, "Tdnn_Bottleneck",
+                                               prev_layer_output, bottleneck_dim,
+                                               self_repair_scale = self_repair_scale,
+                                               norm_target_rms = 1.0 if i < num_hidden_layers - 1 else final_layer_normalize_target)
+
+                elif nonlin_type == "pnorm":
+                    prev_layer_output = nodes.AddAffPnormLayer(config_lines, "Tdnn_Bottleneck",
+                                               prev_layer_output, nonlin_input_dim, bottleneck_dim,
+                                               norm_target_rms = 1.0 if i < num_hidden_layers - 1 else final_layer_normalize_target)
+                else:
+                    raise Exception("Unknown nonlinearity type")
+            else:
+                if nonlin_type == "relu":
+                    prev_layer_output = nodes.AddAffRelNormLayer(config_lines, "Tdnn_{0}".format(i),
+                                               prev_layer_output, nonlin_output_dim,
+                                               self_repair_scale = self_repair_scale,
+                                               norm_target_rms = 1.0 if i < num_hidden_layers - 1 else final_layer_normalize_target)
+                elif nonlin_type == "pnorm":
+                    prev_layer_output = nodes.AddAffPnormLayer(config_lines, "Tdnn_{0}".format(i),
+                                               prev_layer_output, nonlin_input_dim, nonlin_output_dim,
+                                               norm_target_rms = 1.0 if i < num_hidden_layers - 1 else final_layer_normalize_target)
+                else:
+                    raise Exception("Unknown nonlinearity type")
+            # Add a separate pre-final affine layer and softmax output layer for
+            # each target language.
+            if len(num_multiple_targets) > 1:
+                for target in range(len(num_multiple_targets)):
+                    nodes.AddFinalLayer(config_lines, prev_layer_output,
+                                        num_multiple_targets[target],
+                                        name_affix = 'output-'+str(target),
+                                        use_presoftmax_prior_scale = use_presoftmax_prior_scale,
+                                        prior_scale_file = prior_scale_file,
+                                        include_log_softmax = include_log_softmax,
+                                        add_final_sigmoid = add_final_sigmoid,
+                                        objective_type = objective_type)
+            else:
+                # a final layer is added after each new layer as we are generating
+                # configs for layer-wise discriminative training
+
+                # add_final_sigmoid adds a sigmoid as a final layer as an alternative
+                # to the log-softmax layer.
+                # http://ufldl.stanford.edu/wiki/index.php/Softmax_Regression#Softmax_Regression_vs._k_Binary_Classifiers
+                # This is useful when you need the final outputs to be probabilities between 0 and 1.
+                # It is usually used with an objective-type such as "quadratic".
+                # Applications include k-binary classification tasks such as Ideal Ratio Mask prediction.
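+                # (Illustrative example: an Ideal Ratio Mask system would typically be
+                # configured with objective_type="quadratic", add_final_sigmoid=true and
+                # include_log_softmax=false, so that each output dimension is an
+                # independent value in [0,1].)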
+ nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, + use_presoftmax_prior_scale = use_presoftmax_prior_scale, + prior_scale_file = prior_scale_file, + include_log_softmax = include_log_softmax, + add_final_sigmoid = add_final_sigmoid, + objective_type = objective_type) + if xent_regularize != 0.0: + nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, + ng_affine_options = " param-stddev=0 bias-stddev=0 learning-rate-factor={0} ".format( + 0.5 / xent_regularize), + use_presoftmax_prior_scale = use_presoftmax_prior_scale, + prior_scale_file = prior_scale_file, + include_log_softmax = True, + name_affix = 'xent') + + config_files['{0}/layer{1}.config'.format(config_dir, i+1)] = config_lines + config_lines = {'components':[], 'component-nodes':[]} + + left_context += int(parsed_splice_output['left_context']) + right_context += int(parsed_splice_output['right_context']) + + # write the files used by other scripts like steps/nnet3/get_egs.sh + f = open(config_dir + "/vars", "w") + print('model_left_context=' + str(left_context), file=f) + print('model_right_context=' + str(right_context), file=f) + print('num_hidden_layers=' + str(num_hidden_layers), file=f) + print('num_targets=' + str(num_targets), file=f) + print('add_lda=' + ('true' if add_lda else 'false'), file=f) + print('include_log_softmax=' + ('true' if include_log_softmax else 'false'), file=f) + print('objective_type=' + objective_type, file=f) + f.close() + + # printing out the configs + # init.config used to train lda-mllt train + for key in config_files.keys(): + PrintConfig(key, config_files[key]) + +def Main(): + args = GetArgs() + + if args.num_multiple_targets is not None: + num_multiple_targets = args.num_multiple_targets.split() + print('Number of output targets is {0}'.format(len(num_multiple_targets))) + + MakeConfigs(config_dir = args.config_dir, + splice_indexes_string = args.splice_indexes, + feat_dim = args.feat_dim, ivector_dim = args.ivector_dim, + num_targets = args.num_targets, + add_lda = args.add_lda, + cnn_layer = args.cnn_layer, + cnn_bottleneck_dim = args.cnn_bottleneck_dim, + cepstral_lifter = args.cepstral_lifter, + nonlin_type = args.nonlin_type, + nonlin_input_dim = args.nonlin_input_dim, + nonlin_output_dim = args.nonlin_output_dim, + subset_dim = args.subset_dim, + use_presoftmax_prior_scale = args.use_presoftmax_prior_scale, + final_layer_normalize_target = args.final_layer_normalize_target, + include_log_softmax = args.include_log_softmax, + add_final_sigmoid = args.add_final_sigmoid, + xent_regularize = args.xent_regularize, + xent_separate_forward_affine = args.xent_separate_forward_affine, + self_repair_scale = args.self_repair_scale_nonlinearity, + objective_type = args.objective_type, + num_multiple_targets = num_multiple_targets, + bottleneck_layer = args.bottleneck_layer, + bottleneck_dim = args.bottleneck_dim) + +if __name__ == "__main__": + Main() + diff --git a/egs/wsj/s5/steps/nnet3/nnet3_train_lib.py b/egs/wsj/s5/steps/nnet3/nnet3_train_lib.py index a43aa05176b..c154e39d7a2 100644 --- a/egs/wsj/s5/steps/nnet3/nnet3_train_lib.py +++ b/egs/wsj/s5/steps/nnet3/nnet3_train_lib.py @@ -24,15 +24,86 @@ def SendMail(message, subject, email_id): logger.info(" Unable to send mail due to error:\n {error}".format(error = str(e))) pass +def IsMultilingual(egs_dir): + # num of langs used to generate egs is written in egs_dir/info/num_lang + # in multilingual setup. 
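+    # (e.g. if egs_dir/info/num_lang contains "3", the egs were generated from three
+    # languages and this function returns True; if the file is absent we assume a
+    # single language.)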
+    multilingual_training = False
+    num_lang = 1
+    num_lang_str = "{0}/info/num_lang".format(egs_dir)
+    if os.path.isfile(num_lang_str):
+        num_lang = int(open(num_lang_str, 'r').readline())
+        if num_lang > 1:
+            multilingual_training = True
+    return multilingual_training
+
+# Generate the example string passed to nnet3-train and nnet3-compute-prob.
+# It chains nnet3-copy-egs, nnet3-merge-egs and (optionally) nnet3-shuffle-egs,
+# and handles the multilingual case.
+# egs_suffix is empty for the egs used by nnet3-train during training and
+# is "valid_diagnostic" or "train_diagnostic" when running nnet3-compute-prob.
+def ExampleString(egs_dir, minibatch_size,
+                  context_opts = None, archive_index = None,
+                  iter = 1, shuffle_buffer_size = 0,
+                  egs_suffix = None, frame = None):
+    multilingual_training = IsMultilingual(egs_dir)
+
+    frame_opt = ""
+    if frame is not None:
+        frame_opt = "--frame={0}".format(frame)
+
+    # There is no example shuffling when computing diagnostics
+    shuffle_str = ""
+    if shuffle_buffer_size > 0:
+        shuffle_str = " nnet3-shuffle-egs --buffer-size={0} --srand={1} ark:- ark:-|".format(shuffle_buffer_size, iter)
+
+    if multilingual_training:
+        # In the multilingual setup, the examples in egs.{archive_index}.scp are written
+        # in groups of minibatch-size w.r.t. language id.
+        # We first merge examples with the same language id and then shuffle the minibatches.
+        # The output names and per-example weights used for training are read from
+        # output.{archive_index} and weight.{archive_index}.
+        egs_str = ("egs" if egs_suffix is None else egs_suffix+".egs")+("."+str(archive_index)+".scp" if archive_index is not None else ".1.scp")
+
+        multilingual_opts = "--weights='ark:{0}/{2}weight.{1}' --outputs='ark:{0}/{2}output.{1}'".format(egs_dir,
+                            (1 if archive_index is None else archive_index),
+                            (str(egs_suffix)+"." if egs_suffix is not None else ""))
+
+        egs_for_train_string = "ark,bg:nnet3-copy-egs {frame_opt} {context_opts} {multilingual_opts} scp:{egs_dir}/{egs_str} ark:- | nnet3-merge-egs --minibatch-size={minibatch_size} --measure-output-frames=false --discard-partial-minibatches=true ark:- ark:- |{shuffle_str}".format(context_opts = (context_opts if context_opts is not None else ""),
+                            egs_dir = egs_dir,
+                            egs_str = egs_str,
+                            minibatch_size = minibatch_size,
+                            multilingual_opts = multilingual_opts,
+                            frame_opt = frame_opt,
+                            shuffle_str = shuffle_str)
+    else:
+        egs_str = ("egs" if egs_suffix is None else egs_suffix+".egs")+("."+str(archive_index)+".ark" if archive_index is not None else "")
+        egs_for_train_string = "ark,bg:nnet3-copy-egs {frame_opt} {context_opts} ark:{egs_dir}/{egs_str} ark:- |{shuffle_str}\
+        nnet3-merge-egs --minibatch-size={minibatch_size} --measure-output-frames=false \
+        --discard-partial-minibatches=true ark:- ark:- |\
+        ".format(context_opts = (context_opts if context_opts is not None else ""),
+                 egs_dir = egs_dir,
+                 minibatch_size = minibatch_size,
+                 egs_str = egs_str,
+                 frame_opt = frame_opt,
+                 shuffle_str = shuffle_str)
+
+    return egs_for_train_string
+
+def StrToBool(values):
+    if values == "true":
+        return True
+    elif values == "false":
+        return False
+    else:
+        raise ValueError
 
 class StrToBoolAction(argparse.Action):
     """ A custom action to convert bools from shell format i.e., true/false
         to python format i.e., True/False """
     def __call__(self, parser, namespace, values, option_string=None):
-        if values == "true":
-            setattr(namespace, self.dest, True)
-        elif values == "false":
-            setattr(namespace, self.dest, False)
-        else:
+        try:
+            setattr(namespace, self.dest, StrToBool(values))
+        except ValueError:
             raise Exception("Unknown value {0} for --{1}".format(values, self.dest))
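+# Illustrative usage of StrToBoolAction (the option shown is just an example):
+#   parser.add_argument("--use-gpu", type=str, action=StrToBoolAction,
+#                       choices=["true", "false"], default=True,
+#                       help="Use GPU for training")
+# This lets shell-style "true"/"false" strings set a Python boolean on the parsed namespace.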
class NullstrToNoneAction(argparse.Action): @@ -101,10 +172,68 @@ def GetSuccessfulModels(num_models, log_file_pattern, difference_threshold=1.0): accepted_models.append(i+1) if len(accepted_models) != num_models: - logger.warn("Only {0}/{1} of the models have been accepted for averaging, based on log files {2}.".format(len(accepted_models), num_models, log_file_pattern)) + logger.warn("""Only {0}/{1} of the models have been accepted +for averaging, based on log files {2}.""".format(len(accepted_models), + num_models, log_file_pattern)) return [accepted_models, max_index+1] +def GetAverageNnetModel(dir, iter, nnets_list, run_opts, + get_raw_nnet_from_am = True, shrink = None): + scale = 1.0 + if shrink is not None: + scale = shrink + + new_iter = iter + 1 + if get_raw_nnet_from_am: + out_model = """- \| nnet3-am-copy --set-raw-nnet=- --scale={scale} \ +{dir}/{iter}.mdl {dir}/{new_iter}.mdl""".format(dir = dir, iter = iter, + new_iter = new_iter, + scale = scale) + else: + if shrink is not None: + out_model = """- \| nnet3-copy --scale={scale} \ +- {dir}/{new_iter}.raw""".format(dir = dir, new_iter = new_iter, scale = scale) + else: + out_model = "{dir}/{new_iter}.raw".format(dir = dir, + new_iter = new_iter) + + RunKaldiCommand(""" +{command} {dir}/log/average.{iter}.log \ +nnet3-average {nnets_list} \ +{out_model}""".format(command = run_opts.command, + dir = dir, + iter = iter, + nnets_list = nnets_list, + out_model = out_model)) + +def GetBestNnetModel(dir, iter, best_model_index, run_opts, + get_raw_nnet_from_am = True, shrink = None): + scale = 1.0 + if shrink is not None: + scale = shrink + + best_model = '{dir}/{next_iter}.{best_model_index}.raw'.format( + dir = dir, + next_iter = iter + 1, + best_model_index = best_model_index) + + if get_raw_nnet_from_am: + out_model = """- \| nnet3-am-copy --set-raw-nnet=- \ +{dir}/{iter}.mdl {dir}/{next_iter}.mdl""".format(dir = dir, iter = iter, + new_iter = iter + 1) + else: + out_model = '{dir}/{next_iter}.raw'.format(dir = dir, + next_iter = iter + 1) + + RunKaldiCommand(""" +{command} {dir}/log/select.{iter}.log \ +nnet3-copy --scale={scale} {best_model} \ +{out_model}""".format(command = run_opts.command, + dir = dir, iter = iter, + best_model = best_model, + out_model = out_model, scale = scale)) + def GetNumberOfLeaves(alidir): [stdout, stderr] = RunKaldiCommand("tree-info {0}/tree 2>/dev/null | grep num-pdfs".format(alidir)) parts = stdout.split() @@ -120,6 +249,7 @@ def GetNumberOfJobs(alidir): except IOError, ValueError: raise Exception('Exception while reading the number of alignment jobs') return num_jobs + def GetIvectorDim(ivector_dir = None): if ivector_dir is None: return 0 @@ -132,6 +262,11 @@ def GetFeatDim(feat_dir): feat_dim = int(stdout_val) return feat_dim +def GetFeatDimFromScp(feat_scp): + [stdout_val, stderr_val] = RunKaldiCommand("feat-to-dim --print-args=false scp:{feat_scp} -".format(feat_scp = feat_scp)) + feat_dim = int(stdout_val) + return feat_dim + def ReadKaldiMatrix(matrix_file): try: lines = map(lambda x: x.split(), open(matrix_file).readlines()) @@ -205,6 +340,28 @@ def ParseModelConfigVarsFile(var_file): raise Exception('Error while parsing the file {0}'.format(var_file)) +def ParseGenericConfigVarsFile(var_file): + variables = {} + try: + var_file_handle = open(var_file, 'r') + for line in var_file_handle: + parts = line.split('=') + field_name = parts[0].strip() + field_value = parts[1].strip() + if field_name in ['model_left_context', 'left_context']: + variables['model_left_context'] = int(field_value) 
+ elif field_name in ['model_right_context', 'right_context']: + variables['model_right_context'] = int(field_value) + elif field_name == 'num_hidden_layers': + variables['num_hidden_layers'] = int(field_value) + else: + variables[field_name] = field_value + return variables + except ValueError: + # we will throw an error at the end of the function so I will just pass + pass + + raise Exception('Error while parsing the file {0}'.format(var_file)) def GenerateEgs(data, alidir, egs_dir, left_context, right_context, @@ -242,6 +399,72 @@ def GenerateEgs(data, alidir, egs_dir, egs_dir = egs_dir, egs_opts = egs_opts if egs_opts is not None else '' )) +def GenerateMultilingualEgs(egs_dirs, run_opts, minibatch_size = 512, + samples_per_iter = 40000, + egs_opts = None, stage = 0): + multi_egs_dir = egs_dirs.split() + + RunKaldiCommand(""" +steps/nnet3/multilingual/get_egs.sh {egs_opts} \ + --cmd "{command}" --stage {stage} \ + --minibatch-size {minibatch_size} \ + --samples-per-iter {samples_per_iter} \ + {num_langs} {egs_dirs} + """.format(command = run_opts.command, + samples_per_iter = samples_per_iter, + minibatch_size = minibatch_size, + num_langs = len(multi_egs_dir) - 1, + stage = stage, + egs_opts = egs_opts if egs_opts is not None else '', + egs_dirs = egs_dirs)) + +def GenerateEgsFromTargets(data, targets_scp, egs_dir, + left_context, right_context, + valid_left_context, valid_right_context, + run_opts, stage = 0, + feat_type = 'raw', online_ivector_dir = None, + target_type = 'dense', num_targets = -1, + samples_per_iter = 20000, frames_per_eg = 20, srand = 0, + egs_opts = None, cmvn_opts = None, transform_dir = None): + if target_type == 'dense': + num_targets = GetFeatDimFromScp(targets_scp) + else: + if num_targets == -1: + raise Exception("--num-targets is required if target-type is dense") + + RunKaldiCommand(""" +steps/nnet3/get_egs_targets.sh {egs_opts} \ + --cmd "{command}" \ + --cmvn-opts "{cmvn_opts}" \ + --feat-type {feat_type} \ + --transform-dir "{transform_dir}" \ + --online-ivector-dir "{ivector_dir}" \ + --left-context {left_context} --right-context {right_context} \ + --valid-left-context {valid_left_context} \ + --valid-right-context {valid_right_context} \ + --stage {stage} \ + --samples-per-iter {samples_per_iter} \ + --frames-per-eg {frames_per_eg} \ + --srand {srand} \ + --target-type {target_type} \ + --num-targets {num_targets} \ + {data} {targets_scp} {egs_dir} + """.format(command = run_opts.egs_command, + cmvn_opts = cmvn_opts if cmvn_opts is not None else '', + feat_type = feat_type, + transform_dir = transform_dir if transform_dir is not None else '', + ivector_dir = online_ivector_dir if online_ivector_dir is not None else '', + left_context = left_context, right_context = right_context, + valid_left_context = valid_left_context, + valid_right_context = valid_right_context, + stage = stage, samples_per_iter = samples_per_iter, + frames_per_eg = frames_per_eg, srand = srand, + num_targets = num_targets, + data = data, + targets_scp = targets_scp, target_type = target_type, + egs_dir = egs_dir, + egs_opts = egs_opts if egs_opts is not None else '' )) + def VerifyEgsDir(egs_dir, feat_dim, ivector_dim, left_context, right_context): try: egs_feat_dim = int(open('{0}/info/feat_dim'.format(egs_dir)).readline()) @@ -316,7 +539,7 @@ def ForceSymlink(file1, file2): os.symlink(file1, file2) def ComputePresoftmaxPriorScale(dir, alidir, num_jobs, run_opts, - presoftmax_prior_scale_power = None): + presoftmax_prior_scale_power = -0.25): # getting the raw pdf count 
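+    # (This function accumulates per-pdf counts from the alignments; the scale applied
+    #  to each pdf is roughly (count + smooth * average_count) ** presoftmax_prior_scale_power,
+    #  renormalized so that the scales average to 1 -- see SmoothPresoftmaxPriorScaleVector below.)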
RunKaldiCommand(""" @@ -336,9 +559,14 @@ def ComputePresoftmaxPriorScale(dir, alidir, num_jobs, run_opts, import glob for file in glob.glob('{0}/pdf_counts.*'.format(dir)): os.remove(file) - - smooth=0.01 pdf_counts = ReadKaldiMatrix('{0}/pdf_counts'.format(dir))[0] + scaled_counts = SmoothPresoftmaxPriorScaleVector(pdf_counts, presoftmax_prior_scale_power = presoftmax_prior_scale_power, smooth = 0.01) + + output_file = "{0}/presoftmax_prior_scale.vec".format(dir) + WriteKaldiMatrix(output_file, [scaled_counts]) + ForceSymlink("../presoftmax_prior_scale.vec", "{0}/configs/presoftmax_prior_scale.vec".format(dir)) + +def SmoothPresoftmaxPriorScaleVector(pdf_counts, presoftmax_prior_scale_power = -0.25, smooth = 0.01): total = sum(pdf_counts) average_count = total/len(pdf_counts) scales = [] @@ -346,20 +574,15 @@ def ComputePresoftmaxPriorScale(dir, alidir, num_jobs, run_opts, scales.append(math.pow(pdf_counts[i] + smooth * average_count, presoftmax_prior_scale_power)) num_pdfs = len(pdf_counts) scaled_counts = map(lambda x: x * float(num_pdfs) / sum(scales), scales) + return scaled_counts - output_file = "{0}/presoftmax_prior_scale.vec".format(dir) - WriteKaldiMatrix(output_file, [scaled_counts]) - ForceSymlink("../presoftmax_prior_scale.vec", "{0}/configs/presoftmax_prior_scale.vec".format(dir)) def PrepareInitialAcousticModel(dir, alidir, run_opts): """ Adds the first layer; this will also add in the lda.mat and presoftmax_prior_scale.vec. It will also prepare the acoustic model with the transition model.""" - RunKaldiCommand(""" -{command} {dir}/log/add_first_layer.log \ - nnet3-init --srand=-3 {dir}/init.raw {dir}/configs/layer1.config {dir}/0.raw """.format(command = run_opts.command, - dir = dir)) + PrepareInitialNetwork(dir, run_opts) # Convert to .mdl, train the transitions, set the priors. RunKaldiCommand(""" @@ -369,6 +592,12 @@ def PrepareInitialAcousticModel(dir, alidir, run_opts): """.format(command = run_opts.command, dir = dir, alidir = alidir)) +def PrepareInitialNetwork(dir, run_opts): + RunKaldiCommand(""" +{command} {dir}/log/add_first_layer.log \ + nnet3-init --srand=-3 {dir}/init.raw {dir}/configs/layer1.config {dir}/0.raw """.format(command = run_opts.command, + dir = dir)) + def VerifyIterations(num_iters, num_epochs, num_hidden_layers, num_archives, max_models_combine, add_layers_period, num_jobs_final): @@ -478,13 +707,17 @@ def GetLearningRate(iter, num_jobs, num_iters, num_archives_processed, return num_jobs * effective_learning_rate -def DoShrinkage(iter, model_file, non_linearity, shrink_threshold): +def DoShrinkage(iter, model_file, name, non_linearity, shrink_threshold, + get_raw_nnet_from_am = True): if iter == 0: return True try: - output, error = RunKaldiCommand("nnet3-am-info --print-args=false {model_file} | grep {non_linearity}".format(non_linearity = non_linearity, model_file = model_file)) + if get_raw_nnet_from_am: + output, error = RunKaldiCommand("nnet3-am-info --print-args=false {model_file} | grep '{name}' | grep {non_linearity}".format(name = name, non_linearity = non_linearity, model_file = model_file)) + else: + output, error = RunKaldiCommand("nnet3-info --print-args=false {model_file} | grep '{name}' | grep {non_linearity}".format(name = name, non_linearity = non_linearity, model_file = model_file)) output = output.strip().split("\n") # eg. 
# component name=Lstm1_f type=SigmoidComponent, dim=1280, count=5.02e+05, value-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.06,0.17,0.19,0.24 0.28,0.33,0.44,0.62,0.79 0.96,0.99,1.0,1.0), mean=0.482, stddev=0.198], deriv-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.0001,0.003,0.004,0.03 0.12,0.18,0.22,0.24,0.25 0.25,0.25,0.25,0.25), mean=0.198, stddev=0.0591]
@@ -506,41 +739,66 @@ def DoShrinkage(iter, model_file, non_linearity, shrink_threshold):
     return False
 
-def ComputeTrainCvProbabilities(dir, iter, egs_dir, run_opts, mb_size=256, wait = False):
+def ComputeTrainCvProbabilities(dir, iter, egs_dir, run_opts, mb_size=256,
+                                wait = False, get_raw_nnet_from_am = True,
+                                compute_accuracy = True):
 
-    model = '{0}/{1}.mdl'.format(dir, iter)
+    if get_raw_nnet_from_am:
+        model = "nnet3-am-copy --raw=true {dir}/{iter}.mdl - |".format(dir = dir, iter = iter)
+    else:
+        model = "{dir}/{iter}.raw".format(dir = dir, iter = iter)
+
+    compute_prob_opts = "--compute-accuracy" if compute_accuracy else ""
+
+    valid_egs_for_compute_prob_str = ExampleString(egs_dir, mb_size,
+                                                   context_opts = None,
+                                                   egs_suffix = "valid_diagnostic")
+
+    train_egs_for_compute_prob_str = ExampleString(egs_dir, mb_size,
+                                                   context_opts = None,
+                                                   egs_suffix = "train_diagnostic")
 
     RunKaldiCommand("""
 {command} {dir}/log/compute_prob_valid.{iter}.log \
-  nnet3-compute-prob "nnet3-am-copy --raw=true {model} - |" \
-        "ark,bg:nnet3-merge-egs --minibatch-size={mb_size} ark:{egs_dir}/valid_diagnostic.egs ark:- |"
+  nnet3-compute-prob {compute_prob_opts} "{model}" \
+        "{egs_string}"
     """.format(command = run_opts.command,
                dir = dir,
                iter = iter,
               mb_size = mb_size,
               model = model,
-               egs_dir = egs_dir), wait = wait)
+               compute_prob_opts = compute_prob_opts,
+               egs_dir = egs_dir,
+               egs_string = valid_egs_for_compute_prob_str), wait = wait)
 
     RunKaldiCommand("""
 {command} {dir}/log/compute_prob_train.{iter}.log \
-  nnet3-compute-prob "nnet3-am-copy --raw=true {model} - |" \
-       "ark,bg:nnet3-merge-egs --minibatch-size={mb_size} ark:{egs_dir}/train_diagnostic.egs ark:- |"
+  nnet3-compute-prob {compute_prob_opts} "{model}" \
+       "{egs_string}"
     """.format(command = run_opts.command,
                dir = dir,
                iter = iter,
              mb_size = mb_size,
              model = model,
-               egs_dir = egs_dir), wait = wait)
-
-
-def ComputeProgress(dir, iter, egs_dir, run_opts, mb_size=256, wait=False):
+               compute_prob_opts = compute_prob_opts,
+               egs_dir = egs_dir,
+               egs_string = train_egs_for_compute_prob_str), wait = wait)
+
+def ComputeProgress(dir, iter, egs_dir, run_opts, mb_size=256, wait=False,
+                    get_raw_nnet_from_am = True):
+    if get_raw_nnet_from_am:
+        prev_model = "nnet3-am-copy --raw=true {dir}/{iter}.mdl - |".format(dir = dir, iter = iter - 1)
+        model = "nnet3-am-copy --raw=true {dir}/{iter}.mdl - |".format(dir = dir, iter = iter)
+    else:
+        prev_model = '{0}/{1}.raw'.format(dir, iter - 1)
+        model = '{0}/{1}.raw'.format(dir, iter)
 
-    prev_model = '{0}/{1}.mdl'.format(dir, iter - 1)
-    model = '{0}/{1}.mdl'.format(dir, iter)
     RunKaldiCommand("""
 {command} {dir}/log/progress.{iter}.log \
-nnet3-info "nnet3-am-copy --raw=true {model} - |" '&&' \
-nnet3-show-progress --use-gpu=no "nnet3-am-copy --raw=true {prev_model} - |" "nnet3-am-copy --raw=true {model} - |" \
+nnet3-info "{model}" '&&' \
+nnet3-show-progress --use-gpu=no "{prev_model}" "{model}" \
 "ark,bg:nnet3-merge-egs --minibatch-size={mb_size} ark:{egs_dir}/train_diagnostic.egs ark:-|"
     """.format(command = run_opts.command,
                dir = dir,
@@ -551,7 +809,8 @@ def ComputeProgress(dir,
iter, egs_dir, run_opts, mb_size=256, wait=False): egs_dir = egs_dir), wait = wait) def CombineModels(dir, num_iters, num_iters_combine, egs_dir, - run_opts, chunk_width = None): + run_opts, chunk_width = None, + get_raw_nnet_from_am = True, compute_accuracy = True): # Now do combination. In the nnet3 setup, the logic # for doing averaging of subsets of the models in the case where # there are too many models to reliably esetimate interpolation @@ -559,10 +818,16 @@ def CombineModels(dir, num_iters, num_iters_combine, egs_dir, raw_model_strings = [] print num_iters_combine for iter in range(num_iters - num_iters_combine + 1, num_iters + 1): - model_file = '{0}/{1}.mdl'.format(dir, iter) - if not os.path.exists(model_file): - raise Exception('Model file {0} missing'.format(model_file)) - raw_model_strings.append('"nnet3-am-copy --raw=true {0} -|"'.format(model_file)) + if get_raw_nnet_from_am: + model_file = '{0}/{1}.mdl'.format(dir, iter) + if not os.path.exists(model_file): + raise Exception('Model file {0} missing'.format(model_file)) + raw_model_strings.append('"nnet3-am-copy --raw=true {0} -|"'.format(model_file)) + else: + model_file = '{0}/{1}.raw'.format(dir, iter) + if not os.path.exists(model_file): + raise Exception('Model file {0} missing'.format(model_file)) + raw_model_strings.append(model_file) if chunk_width is not None: # this is an RNN model @@ -570,26 +835,37 @@ def CombineModels(dir, num_iters, num_iters_combine, egs_dir, else: mbsize = 1024 + if get_raw_nnet_from_am: + out_model = "|nnet3-am-copy --set-raw-nnet=- {dir}/{num_iters}.mdl {dir}/combined.mdl".format(dir = dir, num_iters = num_iters) + else: + out_model = '{dir}/final.raw'.format(dir = dir) + RunKaldiCommand(""" {command} {combine_queue_opt} {dir}/log/combine.log \ nnet3-combine --num-iters=40 \ --enforce-sum-to-one=true --enforce-positive-weights=true \ --verbose=3 {raw_models} "ark,bg:nnet3-merge-egs --measure-output-frames=false --minibatch-size={mbsize} ark:{egs_dir}/combine.egs ark:-|" \ -"|nnet3-am-copy --set-raw-nnet=- {dir}/{num_iters}.mdl {dir}/combined.mdl" - """.format(command = run_opts.command, + {out_model} + """.format(command = run_opts.command, combine_queue_opt = run_opts.combine_queue_opt, dir = dir, raw_models = " ".join(raw_model_strings), mbsize = mbsize, - num_iters = num_iters, + out_model = out_model, egs_dir = egs_dir)) - # Compute the probability of the final, combined model with - # the same subset we used for the previous compute_probs, as the - # different subsets will lead to different probs. - ComputeTrainCvProbabilities(dir, 'combined', egs_dir, run_opts, wait = False) + # Compute the probability of the final, combined model with + # the same subset we used for the previous compute_probs, as the + # different subsets will lead to different probs. + if get_raw_nnet_from_am: + ComputeTrainCvProbabilities(dir, 'combined', egs_dir, run_opts, wait = False) + else: + ComputeTrainCvProbabilities(dir, 'final', egs_dir, run_opts, + wait = False, get_raw_nnet_from_am = False, + compute_accuracy = compute_accuracy) def ComputeAveragePosterior(dir, iter, egs_dir, num_archives, - prior_subset_size, run_opts): + prior_subset_size, run_opts, + get_raw_nnet_from_am = True): # Note: this just uses CPUs, using a smallish subset of data. 
""" Computes the average posterior of the network""" import glob @@ -601,15 +877,20 @@ def ComputeAveragePosterior(dir, iter, egs_dir, num_archives, else: egs_part = 'JOB' + if get_raw_nnet_from_am: + model = "nnet3-am-copy --raw=true {dir}/combined.mdl -|".format(dir = dir) + else: + model = "{dir}/final.raw".format(dir = dir) + RunKaldiCommand(""" {command} JOB=1:{num_jobs_compute_prior} {prior_queue_opt} {dir}/log/get_post.{iter}.JOB.log \ nnet3-subset-egs --srand=JOB --n={prior_subset_size} ark:{egs_dir}/egs.{egs_part}.ark ark:- \| \ nnet3-merge-egs --measure-output-frames=true --minibatch-size=128 ark:- ark:- \| \ nnet3-compute-from-egs {prior_gpu_opt} --apply-exp=true \ - "nnet3-am-copy --raw=true {dir}/combined.mdl -|" ark:- ark:- \| \ + {model} ark:- ark:- \| \ matrix-sum-rows ark:- ark:- \| vector-sum ark:- {dir}/post.{iter}.JOB.vec """.format(command = run_opts.command, - dir = dir, + dir = dir, model = model, num_jobs_compute_prior = run_opts.num_jobs_compute_prior, prior_queue_opt = run_opts.prior_queue_opt, iter = iter, prior_subset_size = prior_subset_size, @@ -643,25 +924,32 @@ def RemoveEgs(egs_dir): def CleanNnetDir(nnet_dir, num_iters, egs_dir, num_iters_combine = None, preserve_model_interval = 100, - remove_egs = True): + remove_egs = True, + get_raw_nnet_from_am = True): try: if remove_egs: RemoveEgs(egs_dir) for iter in range(num_iters): RemoveModel(nnet_dir, iter, num_iters, 1, - preserve_model_interval) + preserve_model_interval, + get_raw_nnet_from_am = get_raw_nnet_from_am) except (IOError, OSError) as err: logger.warning("Error while cleaning up the nnet directory") raise err def RemoveModel(nnet_dir, iter, num_iters, num_iters_combine = None, - preserve_model_interval = 100): + preserve_model_interval = 100, + get_raw_nnet_from_am = True): if iter % preserve_model_interval == 0: return if num_iters_combine is not None and iter >= num_iters - num_iters_combine + 1 : return - file_name = '{0}/{1}.mdl'.format(nnet_dir, iter) + if get_raw_nnet_from_am: + file_name = '{0}/{1}.mdl'.format(nnet_dir, iter) + else: + file_name = '{0}/{1}.raw'.format(nnet_dir, iter) + if os.path.isfile(file_name): os.remove(file_name) diff --git a/egs/wsj/s5/steps/nnet3/report/generate_plots.py b/egs/wsj/s5/steps/nnet3/report/generate_plots.py index ea8f41749da..f1c489f4ca0 100755 --- a/egs/wsj/s5/steps/nnet3/report/generate_plots.py +++ b/egs/wsj/s5/steps/nnet3/report/generate_plots.py @@ -47,7 +47,7 @@ def GetArgs(): """) parser.add_argument("--comparison-dir", type=str, action='append', help="other experiment directories for comparison. These will only be used for plots, not tables") parser.add_argument("--start-iter", type=int, help="Iteration from which plotting will start", default = 1) - parser.add_argument("--is-chain", type=str, default = False, action = train_lib.StrToBoolAction, help="Iteration from which plotting will start") + parser.add_argument("--objective-type", type=str, default="linear", choices=["linear","quadratic","chain"], help="Objective function used during training -- determines which plots are to be plotted."); parser.add_argument("exp_dir", help="experiment directory, e.g. exp/nnet3/tdnn") parser.add_argument("output_dir", help="experiment directory, e.g. 
exp/nnet3/tdnn/report") @@ -422,7 +422,7 @@ def GenerateParameterDiffPlots(exp_dir, output_dir, plot, comparison_dir = None, if latex_report is not None: latex_report.AddFigure(figfile_name, "Parameter differences at {0}".format(component_name)) -def GeneratePlots(exp_dir, output_dir, comparison_dir = None, start_iter = 1, is_chain = False): +def GeneratePlots(exp_dir, output_dir, comparison_dir = None, start_iter = 1, objective_type = "linear"): try: os.makedirs(output_dir) except OSError as e: @@ -435,15 +435,18 @@ def GeneratePlots(exp_dir, output_dir, comparison_dir = None, start_iter = 1, is else: latex_report = None - if is_chain: + if objective_type == "chain": logger.info("Generating log-probability plots") GenerateAccuracyPlots(exp_dir, output_dir, plot, key = 'log-probability', file_basename = 'log_probability', comparison_dir = comparison_dir, start_iter = start_iter, latex_report = latex_report) - else: + elif objective_type == "linear": logger.info("Generating accuracy plots") GenerateAccuracyPlots(exp_dir, output_dir, plot, key = 'accuracy', file_basename = 'accuracy', comparison_dir = comparison_dir, start_iter = start_iter, latex_report = latex_report) logger.info("Generating log-likelihood plots") GenerateAccuracyPlots(exp_dir, output_dir, plot, key = 'log-likelihood', file_basename = 'loglikelihood', comparison_dir = comparison_dir, start_iter = start_iter, latex_report = latex_report) + else: + logger.info("Generating " + objective_type + " objective plots") + GenerateAccuracyPlots(exp_dir, output_dir, plot, key = 'objective', file_basename = 'objective', comparison_dir = comparison_dir, start_iter = start_iter, latex_report = latex_report) logger.info("Generating non-linearity stats plots") GenerateNonlinStatsPlots(exp_dir, output_dir, plot, comparison_dir = comparison_dir, start_iter = start_iter, latex_report = latex_report) @@ -465,7 +468,7 @@ def Main(): GeneratePlots(args.exp_dir, args.output_dir, comparison_dir = args.comparison_dir, start_iter = args.start_iter, - is_chain = args.is_chain) + objective_type = args.objective_type) if __name__ == "__main__": Main() diff --git a/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py b/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py index bac260e93bc..d79be683cac 100755 --- a/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py +++ b/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py @@ -43,6 +43,8 @@ def GetArgs(): help="alignment directory, from which we derive the num-targets") num_target_group.add_argument("--tree-dir", type=str, help="directory with final.mdl, from which we derive the num-targets") + num_target_group.add_argument("--num-multiple-targets", type=str, + help="space separated number of network targets for different languages(e.g. num-pdf-ids/num-leaves e.g. '1000 2000 3000')") # CNN options parser.add_argument('--cnn.layer', type=str, action='append', dest = "cnn_layer", @@ -54,10 +56,6 @@ def GetArgs(): help="Output dimension of the linear layer at the CNN output " "for dimension reduction, e.g. 256." "The default zero means this layer is not needed.", default=0) - parser.add_argument("--cnn.cepstral-lifter", type=float, dest = "cepstral_lifter", - help="The factor used for determining the liftering vector in the production of MFCC. " - "User has to ensure that it matches the lifter used in MFCC generation, " - "e.g. 
22.0", default=22.0) # General neural network options parser.add_argument("--splice-indexes", type=str, required = True, @@ -69,6 +67,8 @@ def GetArgs(): help="If \"true\" an LDA matrix computed from the input features " "(spliced according to the first set of splice-indexes) will be used as " "the first Affine layer. This affine layer's parameters are fixed during training. " + "This variable needs to be set to \"false\" when using dense-targets " + "or when --add-idct is set to \"true\"." "If --cnn.layer is specified this option will be forced to \"false\".", default=True, choices = ["false", "true"]) @@ -116,9 +116,26 @@ def GetArgs(): parser.add_argument("--use-presoftmax-prior-scale", type=str, action=nnet3_train_lib.StrToBoolAction, help="if true, a presoftmax-prior-scale is added", choices=['true', 'false'], default = True) + + # Options to convert input MFCC into Fbank features. This is useful when a + # LDA layer is not added (such as when using dense targets) + parser.add_argument("--cepstral-lifter", type=float, dest = "cepstral_lifter", + help="The factor used for determining the liftering vector in the production of MFCC. " + "User has to ensure that it matches the lifter used in MFCC generation, " + "e.g. 22.0", default=22.0) + + parser.add_argument("--add-idct", type=str, action=nnet3_train_lib.StrToBoolAction, + help="Add an IDCT after input to convert MFCC to Fbank", default = False) parser.add_argument("config_dir", help="Directory to write config files and variables") - + # multilingual tdnn with bn layer config + parser.add_argument("--bottleneck-layer", type=int, + help="The layer number to add bottleneck layer," + "if < 0, means this layer is not needed in network.", + default=-1) + parser.add_argument("--bottleneck-dim", type=int, + help="The bottleneck layer dimension in TDNN network e.g. 42.", + default=40) print(' '.join(sys.argv)) args = parser.parse_args() @@ -145,9 +162,13 @@ def CheckArgs(args): if not args.feat_dim > 0: raise Exception("feat-dim has to be postive") + if args.add_lda and args.add_idct: + raise Exception("add-idct can be true only if add-lda is false") + if not args.num_targets > 0: - print(args.num_targets) - raise Exception("num_targets has to be positive") + if args.num_multiple_targets is None: + print(args.num_targets) + raise Exception("num_targets or num_multiple_targets has to be positive") if not args.ivector_dim >= 0: raise Exception("ivector-dim has to be non-negative") @@ -323,7 +344,7 @@ def ParseSpliceString(splice_indexes): # The function signature of MakeConfigs is changed frequently as it is intended for local use in this script. 
def MakeConfigs(config_dir, splice_indexes_string, cnn_layer, cnn_bottleneck_dim, cepstral_lifter, - feat_dim, ivector_dim, num_targets, add_lda, + feat_dim, ivector_dim, num_targets, add_lda, add_idct, nonlin_type, nonlin_input_dim, nonlin_output_dim, subset_dim, nonlin_output_dim_init, nonlin_output_dim_final, use_presoftmax_prior_scale, @@ -333,7 +354,8 @@ def MakeConfigs(config_dir, splice_indexes_string, xent_regularize, xent_separate_forward_affine, self_repair_scale, - objective_type): + objective_type, + num_multiple_targets, bottleneck_layer, bottleneck_dim): parsed_splice_output = ParseSpliceString(splice_indexes_string.strip()) @@ -351,8 +373,14 @@ def MakeConfigs(config_dir, splice_indexes_string, config_lines = {'components':[], 'component-nodes':[]} + if add_idct and cnn_layer is None: + # If CNN layer is not None, IDCT will be add inside AddCnnLayers method + nnet3_train_lib.WriteIdctMatrix(feat_dim, cepstral_lifter, config_dir.strip() + "/idct.mat") + config_files={} - prev_layer_output = nodes.AddInputLayer(config_lines, feat_dim, splice_indexes[0], ivector_dim) + prev_layer_output = nodes.AddInputLayer(config_lines, feat_dim, splice_indexes[0], + ivector_dim, + idct_mat = config_dir.strip() + "/idct.mat" if (add_idct and cnn_layer is None) else None) # Add the init config lines for estimating the preconditioning matrices init_config_lines = copy.deepcopy(config_lines) @@ -365,6 +393,9 @@ def MakeConfigs(config_dir, splice_indexes_string, prev_layer_output = AddCnnLayers(config_lines, cnn_layer, cnn_bottleneck_dim, cepstral_lifter, config_dir, feat_dim, splice_indexes[0], ivector_dim) + # add_lda needs to be set "false" when using dense targets, + # or if the task is not a simple classification task + # (e.g. regression, multi-task) if add_lda: prev_layer_output = nodes.AddLdaLayer(config_lines, "L0", prev_layer_output, config_dir + '/lda.mat') @@ -387,7 +418,15 @@ def MakeConfigs(config_dir, splice_indexes_string, for i in range(0, num_hidden_layers): # make the intermediate config file for layerwise discriminative training - + bnf_suffix="" + if bottleneck_layer > -1 and i+1 == bottleneck_layer: + print('bottleneck layer and its dimension are {0} and {1} respectively.'.format(bottleneck_layer, bottleneck_dim)) + nonlin_output_layer_dim = bottleneck_dim + bnf_suffix = "_Bottleneck" + elif nonlin_type == "relu": + nonlin_output_layer_dim = nonlin_output_dims[i] + elif nonlin_type == "pnorm": + nonlin_output_layer_dim = nonlin_output_dim # prepare the spliced input if not (len(splice_indexes[i]) == 1 and splice_indexes[i][0] == 0): try: @@ -425,21 +464,21 @@ def MakeConfigs(config_dir, splice_indexes_string, if nonlin_type == "relu" : prev_layer_output_chain = nodes.AddAffRelNormLayer(config_lines, "Tdnn_pre_final_chain", - prev_layer_output, nonlin_output_dim, + prev_layer_output, nonlin_output_layer_dim, self_repair_scale = self_repair_scale, norm_target_rms = final_layer_normalize_target) prev_layer_output_xent = nodes.AddAffRelNormLayer(config_lines, "Tdnn_pre_final_xent", - prev_layer_output, nonlin_output_dim, + prev_layer_output, nonlin_output_layer_dim, self_repair_scale = self_repair_scale, norm_target_rms = final_layer_normalize_target) elif nonlin_type == "pnorm" : prev_layer_output_chain = nodes.AddAffPnormLayer(config_lines, "Tdnn_pre_final_chain", - prev_layer_output, nonlin_input_dim, nonlin_output_dim, + prev_layer_output, nonlin_input_dim, nonlin_output_layer_dim, norm_target_rms = final_layer_normalize_target) prev_layer_output_xent = 
nodes.AddAffPnormLayer(config_lines, "Tdnn_pre_final_xent", - prev_layer_output, nonlin_input_dim, nonlin_output_dim, + prev_layer_output, nonlin_input_dim, nonlin_output_layer_dim, norm_target_rms = final_layer_normalize_target) else: raise Exception("Unknown nonlinearity type") @@ -458,39 +497,59 @@ def MakeConfigs(config_dir, splice_indexes_string, name_affix = 'xent') else: if nonlin_type == "relu": - prev_layer_output = nodes.AddAffRelNormLayer(config_lines, "Tdnn_{0}".format(i), - prev_layer_output, nonlin_output_dims[i], + prev_layer_output = nodes.AddAffRelNormLayer(config_lines, "Tdnn{1}_{0}".format(i, bnf_suffix), + prev_layer_output, nonlin_output_layer_dim, self_repair_scale = self_repair_scale, norm_target_rms = 1.0 if i < num_hidden_layers -1 else final_layer_normalize_target) elif nonlin_type == "pnorm": - prev_layer_output = nodes.AddAffPnormLayer(config_lines, "Tdnn_{0}".format(i), - prev_layer_output, nonlin_input_dim, nonlin_output_dim, + prev_layer_output = nodes.AddAffPnormLayer(config_lines, "Tdnn{1}_{0}".format(i, bnf_suffix), + prev_layer_output, nonlin_input_dim, nonlin_output_layer_dim, norm_target_rms = 1.0 if i < num_hidden_layers -1 else final_layer_normalize_target) else: raise Exception("Unknown nonlinearity type") - # a final layer is added after each new layer as we are generating - # configs for layer-wise discriminative training - - # add_final_sigmoid adds a sigmoid as a final layer as alternative - # to log-softmax layer. - # http://ufldl.stanford.edu/wiki/index.php/Softmax_Regression#Softmax_Regression_vs._k_Binary_Classifiers - # This is useful when you need the final outputs to be probabilities between 0 and 1. - # Usually used with an objective-type such as "quadratic". - # Applications are k-binary classification such Ideal Ratio Mask prediction. - nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, - use_presoftmax_prior_scale = use_presoftmax_prior_scale, - prior_scale_file = prior_scale_file, - include_log_softmax = include_log_softmax, - add_final_sigmoid = add_final_sigmoid, - objective_type = objective_type) - if xent_regularize != 0.0: - nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, - ng_affine_options = " param-stddev=0 bias-stddev=0 learning-rate-factor={0} ".format( - 0.5 / xent_regularize), + + if len(num_multiple_targets) > 1: + for target in range(len(num_multiple_targets)): + nodes.AddFinalLayer(config_lines, prev_layer_output, + num_multiple_targets[target], + name_affix = str(target), use_presoftmax_prior_scale = use_presoftmax_prior_scale, prior_scale_file = prior_scale_file, - include_log_softmax = True, - name_affix = 'xent') + include_log_softmax = include_log_softmax, + add_final_sigmoid = add_final_sigmoid, + objective_type = objective_type) + if xent_regularize != 0.0: + nodes.AddFinalLayer(config_lines, prev_layer_output, num_multiple_targets[target], + ng_affine_options = " param-stddev=0 bias-stddev=0 learning-rate-factor={0} ".format( + 0.5 / xent_regularize), + use_presoftmax_prior_scale = use_presoftmax_prior_scale, + prior_scale_file = prior_scale_file, + include_log_softmax = True, + name_affix = 'xent-output-'+str(target)) + else: + # a final layer is added after each new layer as we are generating + # configs for layer-wise discriminative training + + # add_final_sigmoid adds a sigmoid as a final layer as alternative + # to log-softmax layer. 
+ # http://ufldl.stanford.edu/wiki/index.php/Softmax_Regression#Softmax_Regression_vs._k_Binary_Classifiers + # This is useful when you need the final outputs to be probabilities between 0 and 1. + # Usually used with an objective-type such as "quadratic". + # Applications are k-binary classification such Ideal Ratio Mask prediction. + nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, + use_presoftmax_prior_scale = use_presoftmax_prior_scale, + prior_scale_file = prior_scale_file, + include_log_softmax = include_log_softmax, + add_final_sigmoid = add_final_sigmoid, + objective_type = objective_type) + if xent_regularize != 0.0: + nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, + ng_affine_options = " param-stddev=0 bias-stddev=0 learning-rate-factor={0} ".format( + 0.5 / xent_regularize), + use_presoftmax_prior_scale = use_presoftmax_prior_scale, + prior_scale_file = prior_scale_file, + include_log_softmax = True, + name_affix = 'xent') config_files['{0}/layer{1}.config'.format(config_dir, i+1)] = config_lines config_lines = {'components':[], 'component-nodes':[]} @@ -516,12 +575,15 @@ def MakeConfigs(config_dir, splice_indexes_string, def Main(): args = GetArgs() + if args.num_multiple_targets is not None: + num_multiple_targets = args.num_multiple_targets.split() + print('Number of output targets is {0}'.format(len(num_multiple_targets))) MakeConfigs(config_dir = args.config_dir, splice_indexes_string = args.splice_indexes, feat_dim = args.feat_dim, ivector_dim = args.ivector_dim, num_targets = args.num_targets, - add_lda = args.add_lda, + add_lda = args.add_lda, add_idct = args.add_idct, cnn_layer = args.cnn_layer, cnn_bottleneck_dim = args.cnn_bottleneck_dim, cepstral_lifter = args.cepstral_lifter, @@ -538,7 +600,10 @@ def Main(): xent_regularize = args.xent_regularize, xent_separate_forward_affine = args.xent_separate_forward_affine, self_repair_scale = args.self_repair_scale_nonlinearity, - objective_type = args.objective_type) + objective_type = args.objective_type, + num_multiple_targets = num_multiple_targets, + bottleneck_layer = args.bottleneck_layer, + bottleneck_dim = args.bottleneck_dim) if __name__ == "__main__": Main() diff --git a/egs/wsj/s5/steps/nnet3/train_dnn.py b/egs/wsj/s5/steps/nnet3/train_dnn.py index a3764b88492..fc137a87d62 100755 --- a/egs/wsj/s5/steps/nnet3/train_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_dnn.py @@ -2,10 +2,11 @@ # Copyright 2016 Vijayaditya Peddinti. +# 2016 Vimal Manohar # Apache 2.0. -# this script is based on steps/nnet3/lstm/train.sh +# this script is based on steps/nnet3/tdnn/train.sh import subprocess @@ -17,7 +18,8 @@ import traceback from nnet3_train_lib import * -nnet3_log_parse = imp.load_source('', 'steps/nnet3/report/nnet3_log_parse_lib.py') +nnet3_log_parse = imp.load_source('nlp', 'steps/nnet3/report/nnet3_log_parse_lib.py') +train_lib = imp.load_source('tl', 'steps/nnet3/libs/train_lib.py') logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -35,170 +37,27 @@ def GetArgs(): Trains a feed forward DNN acoustic model using the cross-entropy objective. DNNs include simple DNNs, TDNNs and CNNs. 
""", - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - - # feat options - parser.add_argument("--feat.online-ivector-dir", type=str, dest='online_ivector_dir', - default = None, action = NullstrToNoneAction, - help="""directory with the ivectors extracted in - an online fashion.""") - parser.add_argument("--feat.cmvn-opts", type=str, dest='cmvn_opts', - default = None, action = NullstrToNoneAction, - help="A string specifying '--norm-means' and '--norm-vars' values") - - # egs extraction options + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + conflict_handler = 'resolve') + + train_lib.AddCommonTrainArgs(parser) + parser.add_argument("--egs.frames-per-eg", type=int, dest='frames_per_eg', default = 8, help="Number of output labels per example") - parser.add_argument("--egs.transform_dir", type=str, dest='transform_dir', - default = None, action = NullstrToNoneAction, - help="""String to provide options directly to steps/nnet3/get_egs.sh script""") - parser.add_argument("--egs.dir", type=str, dest='egs_dir', - default = None, action = NullstrToNoneAction, - help="""Directory with egs. If specified this directory - will be used rather than extracting egs""") - parser.add_argument("--egs.stage", type=int, dest='egs_stage', - default = 0, help="Stage at which get_egs.sh should be restarted") - parser.add_argument("--egs.opts", type=str, dest='egs_opts', - default = None, action = NullstrToNoneAction, - help="""String to provide options directly to steps/nnet3/get_egs.sh script""") - - # trainer options - parser.add_argument("--trainer.srand", type=int, dest='srand', - default = 0, - help="Sets the random seed for model initialization and egs shuffling. " - "Warning: This random seed does not control all aspects of this experiment. " - "There might be other random seeds used in other stages of the experiment " - "like data preparation (e.g. volume perturbation).") - parser.add_argument("--trainer.num-epochs", type=int, dest='num_epochs', - default = 8, - help="Number of epochs to train the model") - parser.add_argument("--trainer.prior-subset-size", type=int, dest='prior_subset_size', - default = 20000, - help="Number of samples for computing priors") - parser.add_argument("--trainer.num-jobs-compute-prior", type=int, dest='num_jobs_compute_prior', - default = 10, - help="The prior computation jobs are single threaded and run on the CPU") - parser.add_argument("--trainer.max-models-combine", type=int, dest='max_models_combine', - default = 20, - help="The maximum number of models used in the final model combination stage. These models will themselves be averages of iteration-number ranges") - parser.add_argument("--trainer.shuffle-buffer-size", type=int, dest='shuffle_buffer_size', - default = 5000, - help="Controls randomization of the samples on each" - "iteration. If 0 or a large value the randomization is" - "complete, but this will consume memory and cause spikes" - "in disk I/O. Smaller is easier on disk and memory but" - "less random. It's not a huge deal though, as samples" - "are anyway randomized right at the start." 
- "(the point of this is to get data in different" - "minibatches on different iterations, since in the" - "preconditioning method, 2 samples in the same minibatch" - "can affect each others' gradients.") - parser.add_argument("--trainer.add-layers-period", type=int, dest='add_layers_period', - default=2, - help="The number of iterations between adding layers" - "during layer-wise discriminative training.") - parser.add_argument("--trainer.max-param-change", type=float, dest='max_param_change', - default=2.0, - help="The maximum change in parameters allowed per minibatch," - "measured in Frobenius norm over the entire model") - parser.add_argument("--trainer.samples-per-iter", type=int, dest='samples_per_iter', - default=400000, - help="This is really the number of egs in each archive.") - parser.add_argument("--trainer.lda.rand-prune", type=float, dest='rand_prune', - default=4.0, - help="""Value used in preconditioning matrix estimation""") - parser.add_argument("--trainer.lda.max-lda-jobs", type=float, dest='max_lda_jobs', - default=10, - help="""Max number of jobs used for LDA stats accumulation""") - parser.add_argument("--trainer.presoftmax-prior-scale-power", type=float, dest='presoftmax_prior_scale_power', - default=-0.25, - help="") - # Realignment parameters - parser.add_argument("--trainer.realign.command", type=str, dest='realign_command', - default=None, action=NullstrToNoneAction, - help="""Command to be used with steps/nnet3/align.sh during realignment""") - parser.add_argument("--trainer.realign.num-jobs", type=int, dest='realign_num_jobs', - default=30, - help="Number of jobs to use for realignment") - parser.add_argument("--trainer.realign.times", type=str, dest='realign_times', - default=None, action=NullstrToNoneAction, - help="""A space seperated string of realignment - times. Values must be between 0 and 1 - e.g. '0.1 0.2 0.3' """) - - parser.add_argument("--trainer.realign.use_gpu", type=str, dest='realign_use_gpu', - default=True, action=StrToBoolAction, - choices = ["true", "false"], - help="If true, gpu is used with steps/nnet3/align.sh") - - # Parameters for the optimization parser.add_argument("--trainer.optimization.minibatch-size", type=float, dest='minibatch_size', default = 512, help="Size of the minibatch used to compute the gradient") - parser.add_argument("--trainer.optimization.initial-effective-lrate", type=float, dest='initial_effective_lrate', - default = 0.0003, - help="Learning rate used during the initial iteration") - parser.add_argument("--trainer.optimization.final-effective-lrate", type=float, dest='final_effective_lrate', - default = 0.00003, - help="Learning rate used during the final iteration") - parser.add_argument("--trainer.optimization.num-jobs-initial", type=int, dest='num_jobs_initial', - default = 1, - help="Number of neural net jobs to run in parallel at the start of training") - parser.add_argument("--trainer.optimization.num-jobs-final", type=int, dest='num_jobs_final', - default = 8, - help="Number of neural net jobs to run in parallel at the end of training") - parser.add_argument("--trainer.optimization.max-models-combine", type=int, dest='max_models_combine', - default = 20, - help = """ The is the maximum number of models we give to the - final 'combine' stage, but these models will themselves - be averages of iteration-number ranges. """) - parser.add_argument("--trainer.optimization.momentum", type=float, dest='momentum', - default = 0.0, - help="""Momentum used in update computation. 
- Note: we implemented it in such a way that - it doesn't increase the effective learning rate.""") - # General options - parser.add_argument("--stage", type=int, default=-4, - help="Specifies the stage of the experiment to execution from") - parser.add_argument("--exit-stage", type=int, default=None, - help="If specified, training exits before running this stage") - parser.add_argument("--cmd", type=str, action = NullstrToNoneAction, - dest = "command", - help="""Specifies the script to launch jobs. - e.g. queue.pl for launching on SGE cluster - run.pl for launching on local machine - """, default = "queue.pl") - parser.add_argument("--use-gpu", type=str, action = StrToBoolAction, - choices = ["true", "false"], - help="Use GPU for training", default=True) - parser.add_argument("--cleanup", type=str, action = StrToBoolAction, - choices = ["true", "false"], - help="Clean up models after training", default=True) - parser.add_argument("--cleanup.remove-egs", type=str, dest='remove_egs', - default = True, action = StrToBoolAction, - choices = ["true", "false"], - help="""If true, remove egs after experiment""") - parser.add_argument("--cleanup.preserve-model-interval", dest = "preserve_model_interval", - type=int, default=100, - help="Determines iterations for which models will be preserved during cleanup. If mod(iter,preserve_model_interval) == 0 model will be preserved.") - - parser.add_argument("--reporting.email", dest = "email", - type=str, default=None, action = NullstrToNoneAction, - help=""" Email-id to report about the progress of the experiment. - NOTE: It assumes the machine on which the script is being run can send - emails from command line via. mail program. The - Kaldi mailing list will not support this feature. - It might require local expertise to setup. """) - parser.add_argument("--reporting.interval", dest = "reporting_interval", - type=int, default=0.1, - help="Frequency with which reports have to be sent, measured in terms of fraction of iterations. 
If 0 and reporting mail has been specified then only failure notifications are sent") + parser.add_argument("--trainer.presoftmax-prior-scale-power", type=float, dest='presoftmax_prior_scale_power', + default=-0.25, + help="") + # General options parser.add_argument("--feat-dir", type=str, required = True, help="Directory with features used for training the neural network.") parser.add_argument("--lang", type=str, required = True, - help="Languade directory") + help="Language directory") parser.add_argument("--ali-dir", type=str, required = True, help="Directory with alignments used for training the neural network.") parser.add_argument("--dir", type=str, required = True, @@ -223,8 +82,9 @@ def ProcessArgs(args): if args.transform_dir is None: args.transform_dir = args.ali_dir + # set the options corresponding to args.use_gpu - run_opts = RunOpts() + run_opts = train_lib.RunOpts() if args.use_gpu: if not CheckIfCudaCompiled(): logger.warning(""" @@ -248,197 +108,12 @@ def ProcessArgs(args): run_opts.prior_gpu_opt = "--use-gpu=no" run_opts.prior_queue_opt = "" - if args.realign_use_gpu is True: - run_opts.realign_use_gpu = True - run_opts.realign_queue_opt = "--gpu 1" - else: - run_opts.realign_use_gpu = False - run_opts.realign_queue_opt = "" - - if args.realign_command is None: - run_opts.realign_command = args.command - else: - run_opts.realign_command = args.realign_command - run_opts.realign_num_jobs = args.realign_num_jobs - run_opts.command = args.command + run_opts.egs_command = args.egs_command if args.egs_command is not None else args.command run_opts.num_jobs_compute_prior = args.num_jobs_compute_prior return [args, run_opts] -# a class to store run options -class RunOpts: - def __init__(self): - self.command = None - self.train_queue_opt = None - self.combine_queue_opt = None - self.prior_gpu_opt = None - self.prior_queue_opt = None - self.parallel_train_opts = None - self.realign_use_gpu = None - -# this is the main method which differs between RNN and DNN training -def TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archives, - raw_model_string, egs_dir, frames_per_eg, - left_context, right_context, - momentum, max_param_change, - shuffle_buffer_size, minibatch_size, - run_opts): - # We cannot easily use a single parallel SGE job to do the main training, - # because the computation of which archive and which --frame option - # to use for each job is a little complex, so we spawn each one separately. - # this is no longer true for RNNs as we use do not use the --frame option - # but we use the same script for consistency with FF-DNN code - - context_opts="--left-context={0} --right-context={1}".format( - left_context, right_context) - processes = [] - for job in range(1,num_jobs+1): - k = num_archives_processed + job - 1 # k is a zero-based index that we will derive - # the other indexes from. - archive_index = (k % num_archives) + 1 # work out the 1-based archive index. 
- frame = (k / num_archives) % frames_per_eg - process_handle = RunKaldiCommand(""" -{command} {train_queue_opt} {dir}/log/train.{iter}.{job}.log \ - nnet3-train {parallel_train_opts} \ - --print-interval=10 --momentum={momentum} \ - --max-param-change={max_param_change} \ - "{raw_model}" \ - "ark,bg:nnet3-copy-egs --frame={frame} {context_opts} ark:{egs_dir}/egs.{archive_index}.ark ark:- | nnet3-shuffle-egs --buffer-size={shuffle_buffer_size} --srand={srand} ark:- ark:-| nnet3-merge-egs --minibatch-size={minibatch_size} --measure-output-frames=false --discard-partial-minibatches=true ark:- ark:- |" \ - {dir}/{next_iter}.{job}.raw - """.format(command = run_opts.command, - train_queue_opt = run_opts.train_queue_opt, - dir = dir, iter = iter, srand = iter + srand, next_iter = iter + 1, job = job, - parallel_train_opts = run_opts.parallel_train_opts, - frame = frame, - momentum = momentum, max_param_change = max_param_change, - raw_model = raw_model_string, context_opts = context_opts, - egs_dir = egs_dir, archive_index = archive_index, - shuffle_buffer_size = shuffle_buffer_size, - minibatch_size = minibatch_size), - wait = False) - - processes.append(process_handle) - - all_success = True - for process in processes: - process.wait() - [stdout_value, stderr_value] = process.communicate() - print(stderr_value) - if process.returncode != 0: - all_success = False - - if not all_success: - open('{0}/.error'.format(dir), 'w').close() - raise Exception("There was error during training iteration {0}".format(iter)) - -def TrainOneIteration(dir, iter, srand, egs_dir, - num_jobs, num_archives_processed, num_archives, - learning_rate, minibatch_size, - frames_per_eg, num_hidden_layers, add_layers_period, - left_context, right_context, - momentum, max_param_change, shuffle_buffer_size, - run_opts): - - - - # Set off jobs doing some diagnostics, in the background. - # Use the egs dir from the previous iteration for the diagnostics - logger.info("Training neural net (pass {0})".format(iter)) - - # check if different iterations use the same random seed - if os.path.exists('{0}/srand'.format(dir)): - try: - saved_srand = int(open('{0}/srand'.format(dir), 'r').readline().strip()) - except IOError, ValueError: - raise Exception('Exception while reading the random seed for training') - if srand != saved_srand: - logger.warning("The random seed provided to this iteration (srand={0}) is different from the one saved last time (srand={1}). Using srand={0}.".format(srand, saved_srand)) - else: - f = open('{0}/srand'.format(dir), 'w') - f.write(str(srand)) - f.close() - - ComputeTrainCvProbabilities(dir, iter, egs_dir, run_opts) - - if iter > 0: - ComputeProgress(dir, iter, egs_dir, run_opts) - - if iter > 0 and (iter <= (num_hidden_layers-1) * add_layers_period) and (iter % add_layers_period == 0): - - do_average = False # if we've just mixed up, don't do averaging but take the - # best. - cur_num_hidden_layers = 1 + iter / add_layers_period - config_file = "{0}/configs/layer{1}.config".format(dir, cur_num_hidden_layers) - raw_model_string = "nnet3-am-copy --raw=true --learning-rate={lr} {dir}/{iter}.mdl - | nnet3-init --srand={srand} - {config} - |".format(lr=learning_rate, dir=dir, iter=iter, srand=iter + srand, config=config_file ) - else: - do_average = True - if iter == 0: - do_average = False # on iteration 0, pick the best, don't average. 
- raw_model_string = "nnet3-am-copy --raw=true --learning-rate={0} {1}/{2}.mdl - |".format(learning_rate, dir, iter) - - if do_average: - cur_minibatch_size = minibatch_size - cur_max_param_change = max_param_change - else: - # on iteration zero or when we just added a layer, use a smaller minibatch - # size (and we will later choose the output of just one of the jobs): the - # model-averaging isn't always helpful when the model is changing too fast - # (i.e. it can worsen the objective function), and the smaller minibatch - # size will help to keep the update stable. - cur_minibatch_size = minibatch_size // 2 - cur_max_param_change = float(max_param_change) / math.sqrt(2) - - try: - os.remove("{0}/.error".format(dir)) - except OSError: - pass - - TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archives, - raw_model_string, egs_dir, frames_per_eg, - left_context, right_context, - momentum, max_param_change, - shuffle_buffer_size, cur_minibatch_size, - run_opts) - [models_to_average, best_model] = GetSuccessfulModels(num_jobs, '{0}/log/train.{1}.%.log'.format(dir,iter)) - nnets_list = [] - for n in models_to_average: - nnets_list.append("{0}/{1}.{2}.raw".format(dir, iter + 1, n)) - - if do_average: - # average the output of the different jobs. - RunKaldiCommand(""" -{command} {dir}/log/average.{iter}.log \ -nnet3-average {nnet_list} - \| \ -nnet3-am-copy --set-raw-nnet=- {dir}/{iter}.mdl {dir}/{new_iter}.mdl - """.format(command = run_opts.command, - dir = dir, - iter = iter, - nnet_list = " ".join(nnets_list), - new_iter = iter + 1)) - - else: - # choose the best model from different jobs - RunKaldiCommand(""" -{command} {dir}/log/select.{iter}.log \ - nnet3-am-copy --set-raw-nnet={dir}/{next_iter}.{best_model_index}.raw {dir}/{iter}.mdl {dir}/{next_iter}.mdl - """.format(command = run_opts.command, - dir = dir, iter = iter, next_iter = iter + 1, - best_model_index = best_model)) - - try: - for i in range(1, num_jobs + 1): - os.remove("{0}/{1}.{2}.raw".format(dir, iter + 1, i)) - except OSError: - raise Exception("Error while trying to delete the raw models") - - new_model = "{0}/{1}.mdl".format(dir, iter + 1) - - if not os.path.isfile(new_model): - raise Exception("Could not find {0}, at the end of iteration {1}".format(new_model, iter)) - elif os.stat(new_model).st_size == 0: - raise Exception("{0} has size 0. Something went wrong in iteration {1}".format(new_model, iter)) - # args is a Namespace with the required parameters def Train(args, run_opts): arg_string = pprint.pformat(vars(args)) @@ -461,7 +136,17 @@ def Train(args, run_opts): config_dir = '{0}/configs'.format(args.dir) var_file = '{0}/vars'.format(config_dir) - [left_context, right_context, num_hidden_layers] = ParseModelConfigVarsFile(var_file) + variables = ParseGenericConfigVarsFile(var_file) + + # Set some variables. + + try: + left_context = variables['model_left_context'] + right_context = variables['model_right_context'] + num_hidden_layers = variables['num_hidden_layers'] + except KeyError as e: + raise Exception("KeyError {0}: Variables need to be defined in {1}".format( + str(e), '{0}/configs'.format(args.dir))) # Initialize as "raw" nnet, prior to training the LDA-like preconditioning # matrix. 
This first config just does any initial splicing that we do; # we do this as it's a convenient way to get the stats for the 'lda-like' @@ -545,15 +230,6 @@ def Train(args, run_opts): num_archives_to_process, args.initial_effective_lrate, args.final_effective_lrate) - realign_iters = [] - if args.realign_times is not None: - realign_iters = GetRealignIters(args.realign_times, - num_iters, - args.num_jobs_initial, - args.num_jobs_final) - print(realign_iters) - # egs_dir will be updated if there is realignment - cur_egs_dir=egs_dir logger.info("Training will run for {0} epochs = {1} iterations".format(args.num_epochs, num_iters)) for iter in range(num_iters): @@ -563,29 +239,28 @@ def Train(args, run_opts): current_num_jobs = int(0.5 + args.num_jobs_initial + (args.num_jobs_final - args.num_jobs_initial) * float(iter) / num_iters) if args.stage <= iter: - if iter in realign_iters: - logger.info("Re-aligning the data at iteration {0}".format(iter)) - prev_egs_dir=cur_egs_dir - cur_egs_dir="{0}/egs_{1}".format(args.dir, "iter"+str(iter)) - new_ali_dir="{0}/ali_{1}".format(args.dir, "iter"+str(iter)) - Realign(args.dir, iter, args.feat_dir, args.lang, - prev_egs_dir, cur_egs_dir, - args.prior_subset_size, num_archives, run_opts, - transform_dir = args.transform_dir, online_ivector_dir = args.online_ivector_dir) - if args.cleanup and args.egs_dir is None: - RemoveEgs(prev_egs_dir) model_file = "{dir}/{iter}.mdl".format(dir = args.dir, iter = iter) logger.info("On iteration {0}, learning rate is {1}.".format(iter, learning_rate(iter, current_num_jobs, num_archives_processed))) - TrainOneIteration(args.dir, iter, args.srand, egs_dir, current_num_jobs, - num_archives_processed, num_archives, - learning_rate(iter, current_num_jobs, num_archives_processed), - args.minibatch_size, args.frames_per_eg, - num_hidden_layers, args.add_layers_period, - left_context, right_context, - args.momentum, args.max_param_change, - args.shuffle_buffer_size, run_opts) + train_lib.TrainOneIteration(dir = args.dir, + iter = iter, + srand = args.srand, + egs_dir = egs_dir, + num_jobs = current_num_jobs, + num_archives_processed = num_archives_processed, + num_archives = num_archives, + learning_rate = learning_rate(iter, current_num_jobs, num_archives_processed), + minibatch_size = args.minibatch_size, + frames_per_eg = args.frames_per_eg, + num_hidden_layers = num_hidden_layers, + add_layers_period = args.add_layers_period, + left_context = left_context, + right_context = right_context, + momentum = args.momentum, + max_param_change = args.max_param_change, + shuffle_buffer_size = args.shuffle_buffer_size, + run_opts = run_opts) if args.cleanup: # do a clean up everythin but the last 2 models, under certain conditions RemoveModel(args.dir, iter-2, num_iters, num_iters_combine, @@ -598,7 +273,7 @@ def Train(args, run_opts): [report, times, data] = nnet3_log_parse.GenerateAccuracyReport(args.dir) message = report subject = "Update : Expt {dir} : Iter {iter}".format(dir = args.dir, iter = iter) - sendMail(message, subject, args.email) + SendMail(message, subject, args.email) num_archives_processed = num_archives_processed + current_num_jobs @@ -624,7 +299,7 @@ def Train(args, run_opts): # delete it remove_egs = False - CleanNnetDir(args.dir, num_iters, cur_egs_dir, + CleanNnetDir(args.dir, num_iters, egs_dir, preserve_model_interval = args.preserve_model_interval, remove_egs = remove_egs) @@ -646,7 +321,7 @@ def Main(): except Exception as e: if args.email is not None: message = "Training session for experiment {dir} died 
due to an error.".format(dir = args.dir) - sendMail(message, message, args.email) + SendMail(message, message, args.email) traceback.print_exc() raise e diff --git a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py new file mode 100755 index 00000000000..f3a11cfcc94 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py @@ -0,0 +1,349 @@ +#!/usr/bin/env python + + +# Copyright 2016 Vijayaditya Peddinti. +# 2016 Vimal Manohar +# Apache 2.0. + + +# this script is based on steps/nnet3/tdnn/train_raw_nnet.sh + + +import subprocess +import argparse +import sys +import pprint +import logging +import imp +import traceback +import os.path +from nnet3_train_lib import * + +nnet3_log_parse = imp.load_source('nlp', 'steps/nnet3/report/nnet3_log_parse_lib.py') +train_lib = imp.load_source('tl', 'steps/nnet3/libs/train_lib.py') + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) +handler = logging.StreamHandler() +handler.setLevel(logging.INFO) +formatter = logging.Formatter('%(asctime)s [%(filename)s:%(lineno)s - %(funcName)s - %(levelname)s ] %(message)s') +handler.setFormatter(formatter) +logger.addHandler(handler) +logger.info('Starting raw DNN trainer (train_raw_dnn.py)') + + +def GetArgs(): + # we add compulsary arguments as named arguments for readability + parser = argparse.ArgumentParser(description=""" + Trains a feed forward raw DNN (without transition model) + using the cross-entropy objective. + DNNs include simple DNNs, TDNNs and CNNs. + """, + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + conflict_handler = 'resolve') + + train_lib.AddCommonTrainArgs(parser) + + parser.add_argument("--egs.frames-per-eg", type=int, dest='frames_per_eg', + default = 8, + help="Number of output labels per example") + + parser.add_argument("--trainer.optimization.minibatch-size", type=float, dest='minibatch_size', + default = 512, + help="Size of the minibatch used to compute the gradient") + + # General options + parser.add_argument("--nj", type=int, default=4, + help="Number of parallel jobs") + + parser.add_argument("--use-dense-targets", type=str, action=StrToBoolAction, + default = True, choices = ["true", "false"], + help="Train neural network using dense targets") + parser.add_argument("--feat-dir", type=str, required = True, + help="Directory with features used for training the neural network.") + parser.add_argument("--targets-scp", type=str, + help="Target for training neural network.") + parser.add_argument("--dir", type=str, required = True, + help="Directory to store the models and all other files.") + + print(' '.join(sys.argv)) + + args = parser.parse_args() + + [args, run_opts] = ProcessArgs(args) + + return [args, run_opts] + +def ProcessArgs(args): + # process the options + if args.frames_per_eg < 1: + raise Exception("--egs.frames-per-eg should have a minimum value of 1") + + if (not os.path.exists(args.dir)) or (not os.path.exists(args.dir+"/configs")): + raise Exception("""This scripts expects {0} to exist and have a configs + directory which is the output of make_configs.py script""") + + # set the options corresponding to args.use_gpu + run_opts = train_lib.RunOpts() + if args.use_gpu: + if not CheckIfCudaCompiled(): + logger.warning(""" + You are running with one thread but you have not compiled + for CUDA. You may be running a setup optimized for GPUs. 
+    GPUs and have nvcc installed, go to src/ and do ./configure; make""")
+
+        run_opts.train_queue_opt = "--gpu 1"
+        run_opts.parallel_train_opts = ""
+        run_opts.combine_queue_opt = "--gpu 1"
+        run_opts.prior_gpu_opt = "--use-gpu=yes"
+        run_opts.prior_queue_opt = "--gpu 1"
+
+    else:
+        logger.warning("""
+    Without using a GPU this will be very slow. nnet3 does not yet support multiple threads.""")
+
+        run_opts.train_queue_opt = ""
+        run_opts.parallel_train_opts = "--use-gpu=no"
+        run_opts.combine_queue_opt = ""
+        run_opts.prior_gpu_opt = "--use-gpu=no"
+        run_opts.prior_queue_opt = ""
+
+    run_opts.command = args.command
+    run_opts.egs_command = args.egs_command if args.egs_command is not None else args.command
+    run_opts.num_jobs_compute_prior = args.num_jobs_compute_prior
+
+    return [args, run_opts]
+
+# args is a Namespace with the required parameters
+def Train(args, run_opts):
+    arg_string = pprint.pformat(vars(args))
+    logger.info("Arguments for the experiment\n{0}".format(arg_string))
+
+    # Set some variables.
+    feat_dim = GetFeatDim(args.feat_dir)
+    ivector_dim = GetIvectorDim(args.online_ivector_dir)
+
+    # split the training data into parts for individual jobs
+    SplitData(args.feat_dir, args.nj)
+
+    config_dir = '{0}/configs'.format(args.dir)
+    var_file = '{0}/vars'.format(config_dir)
+
+    variables = ParseGenericConfigVarsFile(var_file)
+
+    # Set some variables.
+
+    try:
+        left_context = variables['model_left_context']
+        right_context = variables['model_right_context']
+        num_hidden_layers = variables['num_hidden_layers']
+        if variables['num_targets'] != 'None':
+            num_targets = int(variables['num_targets'])
+        add_lda = StrToBool(variables['add_lda'])
+        include_log_softmax = StrToBool(variables['include_log_softmax'])
+        objective_type = variables['objective_type']
+    except KeyError as e:
+        raise Exception("KeyError {0}: Variables need to be defined in {1}".format(
+            str(e), '{0}/configs'.format(args.dir)))
+
+    # Initialize as "raw" nnet, prior to training the LDA-like preconditioning
+    # matrix.  This first config just does any initial splicing that we do;
+    # we do this as it's a convenient way to get the stats for the 'lda-like'
+    # transform.
+
+    if args.use_dense_targets:
+        if GetFeatDimFromScp(args.targets_scp) != num_targets:
+            raise Exception("Mismatch between num-targets provided to "
+                            "script vs configs")
+
+    if (args.stage <= -5):
+        logger.info("Initializing a basic network for estimating preconditioning matrix")
+        RunKaldiCommand("""
+{command} {dir}/log/nnet_init.log \
+    nnet3-init --srand=-2 {dir}/configs/init.config {dir}/init.raw
+    """.format(command = run_opts.command,
+               dir = args.dir))
+
+    default_egs_dir = '{0}/egs'.format(args.dir)
+
+    if args.use_dense_targets:
+        target_type = "dense"
+        compute_accuracy = False
+    else:
+        target_type = "sparse"
+        compute_accuracy = True if objective_type == "linear" else False
+
+    # If more than one egs dir is given in args.egs_dir (space-separated), this
+    # corresponds to multilingual training, and an egs dir should be generated
+    # for each language.  The last dir is the multilingual egs directory, which
+    # is generated by this script, but it requires the per-language egs dirs to
+    # already exist.
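For illustration, a minimal sketch (hypothetical paths, not part of the patch) of how a space-separated --egs.dir value is interpreted under the convention described in the comment above:

    # illustration only: hypothetical paths
    egs_dir_arg = "exp/lang1/egs exp/lang2/egs exp/multi/egs"
    dirs = egs_dir_arg.split()
    per_language_egs_dirs = dirs[:-1]   # these must already exist
    multilingual_egs_dir = dirs[-1]     # generated by this script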
+    multi_egs_dir = args.egs_dir.split() if args.egs_dir is not None else []
+    if len(multi_egs_dir) <= 1:
+        default_egs_dir = '{0}/egs'.format(args.dir)
+        if (args.stage <= -4) and args.egs_dir is None:
+            logger.info("Generating egs")
+
+            GenerateEgsFromTargets(args.feat_dir, args.targets_scp, default_egs_dir,
+                                   left_context, right_context,
+                                   left_context, right_context, run_opts,
+                                   frames_per_eg = args.frames_per_eg,
+                                   egs_opts = args.egs_opts,
+                                   cmvn_opts = args.cmvn_opts,
+                                   online_ivector_dir = args.online_ivector_dir,
+                                   samples_per_iter = args.samples_per_iter,
+                                   transform_dir = args.transform_dir,
+                                   stage = args.egs_stage,
+                                   target_type = target_type,
+                                   num_targets = num_targets)
+
+        if args.egs_dir is None:
+            egs_dir = default_egs_dir
+        else:
+            egs_dir = args.egs_dir
+    else:
+        egs_dir = multi_egs_dir[-1]
+        if (args.stage <= -4):
+            logger.info("Generating multilingual egs dir")
+            GenerateMultilingualEgs(args.egs_dir, run_opts,
+                                    stage = args.egs_stage,
+                                    samples_per_iter = args.samples_per_iter,
+                                    egs_opts = args.egs_opts)
+
+    [egs_left_context, egs_right_context, frames_per_eg, num_archives] = VerifyEgsDir(egs_dir, feat_dim, ivector_dim, left_context, right_context)
+    assert(args.frames_per_eg == frames_per_eg)
+
+    if (args.num_jobs_final > num_archives):
+        raise Exception('num_jobs_final cannot exceed the number of archives in the egs directory')
+
+    # copy the properties of the egs to dir for
+    # use during decoding
+    CopyEgsPropertiesToExpDir(egs_dir, args.dir)
+
+    if (add_lda and args.stage <= -3):
+        logger.info('Computing the preconditioning matrix for input features')
+
+        ComputePreconditioningMatrix(args.dir, egs_dir, num_archives, run_opts,
+                                     max_lda_jobs = args.max_lda_jobs,
+                                     rand_prune = args.rand_prune)
+
+
+    if (args.stage <= -1):
+        logger.info("Preparing the initial network.")
+        PrepareInitialNetwork(args.dir, run_opts)
+
+
+    # set num_iters so that as close as possible, we process the data $num_epochs
+    # times, i.e. $num_iters*$avg_num_jobs == $num_epochs*$num_archives,
+    # where avg_num_jobs=(num_jobs_initial+num_jobs_final)/2.
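As a worked example of the relation in the comment above (hypothetical numbers, not taken from any recipe):

    # hypothetical numbers illustrating the iteration count computed below
    num_archives = 120; frames_per_eg = 8
    num_epochs = 2; num_jobs_initial = 2; num_jobs_final = 8
    num_archives_expanded = num_archives * frames_per_eg            # 960
    num_archives_to_process = num_epochs * num_archives_expanded    # 1920
    # avg_num_jobs = (2 + 8) / 2 = 5, so roughly 1920 / 5 iterations:
    num_iters = (num_archives_to_process * 2) // (num_jobs_initial + num_jobs_final)  # 384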
+    num_archives_expanded = num_archives * args.frames_per_eg
+    num_archives_to_process = args.num_epochs * num_archives_expanded
+    num_archives_processed = 0
+    num_iters=(num_archives_to_process * 2) / (args.num_jobs_initial + args.num_jobs_final)
+
+    num_iters_combine = VerifyIterations(num_iters, args.num_epochs,
+                                         num_hidden_layers, num_archives_expanded,
+                                         args.max_models_combine, args.add_layers_period,
+                                         args.num_jobs_final)
+
+    learning_rate = lambda iter, current_num_jobs, num_archives_processed: GetLearningRate(iter, current_num_jobs, num_iters,
+                                                                                           num_archives_processed,
+                                                                                           num_archives_to_process,
+                                                                                           args.initial_effective_lrate,
+                                                                                           args.final_effective_lrate)
+
+    logger.info("Training will run for {0} epochs = {1} iterations".format(args.num_epochs, num_iters))
+    for iter in range(num_iters):
+        if (args.exit_stage is not None) and (iter == args.exit_stage):
+            logger.info("Exiting early due to --exit-stage {0}".format(iter))
+            return
+        current_num_jobs = int(0.5 + args.num_jobs_initial + (args.num_jobs_final - args.num_jobs_initial) * float(iter) / num_iters)
+
+        if args.stage <= iter:
+            model_file = "{dir}/{iter}.mdl".format(dir = args.dir, iter = iter)
+
+            logger.info("On iteration {0}, learning rate is {1}.".format(iter, learning_rate(iter, current_num_jobs, num_archives_processed)))
+
+            train_lib.TrainOneIteration(dir = args.dir,
+                                        iter = iter,
+                                        srand = args.srand,
+                                        egs_dir = egs_dir,
+                                        num_jobs = current_num_jobs,
+                                        num_archives_processed = num_archives_processed,
+                                        num_archives = num_archives,
+                                        learning_rate = learning_rate(iter, current_num_jobs, num_archives_processed),
+                                        minibatch_size = args.minibatch_size,
+                                        frames_per_eg = args.frames_per_eg,
+                                        num_hidden_layers = num_hidden_layers,
+                                        add_layers_period = args.add_layers_period,
+                                        left_context = left_context,
+                                        right_context = right_context,
+                                        momentum = args.momentum,
+                                        max_param_change = args.max_param_change,
+                                        shuffle_buffer_size = args.shuffle_buffer_size,
+                                        run_opts = run_opts,
+                                        compute_accuracy = compute_accuracy,
+                                        get_raw_nnet_from_am = False)
+            if args.cleanup:
+                # clean up everything but the last 2 models, under certain conditions
+                RemoveModel(args.dir, iter-2, num_iters, num_iters_combine,
+                            args.preserve_model_interval, get_raw_nnet_from_am = False)
+
+            if args.email is not None:
+                reporting_iter_interval = num_iters * args.reporting_interval
+                if iter % reporting_iter_interval == 0:
+                    # let's do some reporting
+                    [report, times, data] = nnet3_log_parse.GenerateAccuracyReport(args.dir)
+                    message = report
+                    subject = "Update : Expt {dir} : Iter {iter}".format(dir = args.dir, iter = iter)
+                    SendMail(message, subject, args.email)
+
+        num_archives_processed = num_archives_processed + current_num_jobs
+
+    if args.stage <= num_iters:
+        logger.info("Doing final combination to produce final.mdl")
+        CombineModels(args.dir, num_iters, num_iters_combine, egs_dir, run_opts,
+                      get_raw_nnet_from_am = False, compute_accuracy = compute_accuracy)
+
+    if include_log_softmax and args.stage <= num_iters + 1:
+        logger.info("Getting average posterior for purpose of using as priors to convert posteriors into likelihoods.")
+        avg_post_vec_file = ComputeAveragePosterior(args.dir, 'final', egs_dir,
+                                                    num_archives, args.prior_subset_size, run_opts, get_raw_nnet_from_am = False)
+
+    if args.cleanup:
+        logger.info("Cleaning up the experiment directory {0}".format(args.dir))
+        remove_egs = args.remove_egs
+        if args.egs_dir is not None:
+            # this egs_dir was not created by this experiment so we will not
+            # delete it
+            remove_egs = False
+
+        CleanNnetDir(args.dir, num_iters, egs_dir,
+                     preserve_model_interval = args.preserve_model_interval,
+                     remove_egs = remove_egs,
+                     get_raw_nnet_from_am = False)
+
+    # do some reporting
+    [report, times, data] = nnet3_log_parse.GenerateAccuracyReport(args.dir)
+    if args.email is not None:
+        SendMail(report, "Update : Expt {0} : complete".format(args.dir), args.email)
+
+    report_handle = open("{dir}/accuracy.report".format(dir = args.dir), "w")
+    report_handle.write(report)
+    report_handle.close()
+
+    os.system("steps/info/nnet3_dir_info.pl " + args.dir)
+
+def Main():
+    [args, run_opts] = GetArgs()
+    try:
+        Train(args, run_opts)
+    except Exception as e:
+        if args.email is not None:
+            message = "Training session for experiment {dir} died due to an error.".format(dir = args.dir)
+            SendMail(message, message, args.email)
+        traceback.print_exc()
+        raise e
+
+if __name__ == "__main__":
+    Main()
diff --git a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py
new file mode 100755
index 00000000000..5842e63474e
--- /dev/null
+++ b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py
@@ -0,0 +1,398 @@
+#!/usr/bin/env python
+
+# Copyright 2016 Vijayaditya Peddinti.
+#           2016 Vimal Manohar
+# Apache 2.0.
+
+# this script is based on steps/nnet3/lstm/train.sh
+
+import subprocess
+import argparse
+import sys
+import pprint
+import logging
+import imp
+import traceback
+import os.path
+from nnet3_train_lib import *
+
+nnet3_log_parse = imp.load_source('nlp', 'steps/nnet3/report/nnet3_log_parse_lib.py')
+rnn_train_lib = imp.load_source('rtl', 'steps/nnet3/libs/rnn_train_lib.py')
+train_lib = imp.load_source('tl', 'steps/nnet3/libs/train_lib.py')
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+handler = logging.StreamHandler()
+handler.setLevel(logging.INFO)
+formatter = logging.Formatter('%(asctime)s [%(filename)s:%(lineno)s - %(funcName)s - %(levelname)s ] %(message)s')
+handler.setFormatter(formatter)
+logger.addHandler(handler)
+logger.info('Starting RNN trainer (train_raw_rnn.py)')
+
+
+def GetArgs():
+    # we add compulsory arguments as named arguments for readability
+    parser = argparse.ArgumentParser(description="""
+    Trains a raw RNN (without a transition model) using the cross-entropy
+    objective.  RNNs include LSTMs, BLSTMs and GRUs.
+    RNN acoustic model training differs from feed-forward DNN training
+    in the following ways:
+        1. RNN acoustic models train on output chunks rather than individual
+           outputs
+        2. The training includes an additional stage of shrinkage, where
+           the parameters of the model are scaled when the derivative averages
+           at the non-linearities are below a threshold.
+        3. RNNs can also be trained with state preservation training
+    """,
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+        conflict_handler = 'resolve')
+
+    train_lib.AddCommonTrainArgs(parser)
+
+    # egs extraction options
+    parser.add_argument("--egs.chunk-width", type=int, dest='chunk_width',
+                        default = 20,
+                        help="""Number of output labels in the sequence
+                        used to train an LSTM.
+ Caution: if you double this you should halve + --trainer.samples-per-iter.""") + parser.add_argument("--egs.chunk-left-context", type=int, dest='chunk_left_context', + default = 40, + help="""Number of left steps used in the estimation of LSTM + state before prediction of the first label""") + parser.add_argument("--egs.chunk-right-context", type=int, dest='chunk_right_context', + default = 0, + help="""Number of right steps used in the estimation of BLSTM + state before prediction of the first label""") + parser.add_argument("--trainer.samples-per-iter", type=int, dest='samples_per_iter', + default=20000, + help="""This is really the number of egs in each + archive. Each eg has 'chunk_width' frames in it-- + for chunk_width=20, this value (20k) is equivalent + to the 400k number that we use as a default in + regular DNN training.""") + + # Parameters for the optimization + parser.add_argument("--trainer.optimization.momentum", type=float, dest='momentum', + default = 0.5, + help="""Momentum used in update computation. + Note: we implemented it in such a way that + it doesn't increase the effective learning rate.""") + parser.add_argument("--trainer.optimization.shrink-value", type=float, dest='shrink_value', + default = 0.99, + help="Scaling factor used for scaling the parameter matrices when the derivative averages are below the shrink-threshold at the non-linearities") + parser.add_argument("--trainer.optimization.shrink-threshold", type=float, dest='shrink_threshold', + default = 0.15, + help="If the derivative averages are below this threshold we scale the parameter matrices with the shrink-value. It is less than 0.25 for sigmoid non-linearities.") + parser.add_argument("--trainer.optimization.cv-minibatch-size", type=int, dest='cv_minibatch_size', + default = 256, + help="Size of the minibatch to be used in diagnostic jobs (use smaller value for BLSTMs to control memory usage)") + + + + # RNN specific trainer options + parser.add_argument("--trainer.rnn.num-chunk-per-minibatch", type=int, dest='num_chunk_per_minibatch', + default=100, + help="Number of sequences to be processed in parallel every minibatch" ) + parser.add_argument("--trainer.rnn.num-bptt-steps", type=int, dest='num_bptt_steps', + default=None, + help="The number of time steps to back-propagate from the last label in the chunk. By default it is same as the chunk-width." 
)
+
+    # General options
+    parser.add_argument("--nj", type=int, default=4,
+                        help="Number of parallel jobs")
+
+    parser.add_argument("--use-dense-targets", type=str, action=StrToBoolAction,
+                        default = True, choices = ["true", "false"],
+                        help="Train neural network using dense targets")
+    parser.add_argument("--feat-dir", type=str, required = True,
+                        help="Directory with features used for training the neural network.")
+    parser.add_argument("--targets-scp", type=str, required = True,
+                        help="Targets for training the neural network.")
+    parser.add_argument("--dir", type=str, required = True,
+                        help="Directory to store the models and all other files.")
+
+    print(' '.join(sys.argv))
+
+    args = parser.parse_args()
+
+    [args, run_opts] = ProcessArgs(args)
+
+    return [args, run_opts]
+
+def ProcessArgs(args):
+    # process the options
+    if args.chunk_width < 1:
+        raise Exception("--egs.chunk-width should have a minimum value of 1")
+
+    if args.chunk_left_context < 0:
+        raise Exception("--egs.chunk-left-context should be non-negative")
+
+    if args.chunk_right_context < 0:
+        raise Exception("--egs.chunk-right-context should be non-negative")
+
+    if (not os.path.exists(args.dir)) or (not os.path.exists(args.dir+"/configs")):
+        raise Exception("""This script expects {0} to exist and have a configs
+                        directory which is the output of the make_configs.py script""".format(args.dir))
+
+    # set the options corresponding to args.use_gpu
+    run_opts = train_lib.RunOpts()
+    if args.use_gpu:
+        if not CheckIfCudaCompiled():
+            logger.warning("""
+    You are running with one thread but you have not compiled
+    for CUDA. You may be running a setup optimized for GPUs. If you have
+    GPUs and have nvcc installed, go to src/ and do ./configure; make""")
+
+        run_opts.train_queue_opt = "--gpu 1"
+        run_opts.parallel_train_opts = ""
+        run_opts.combine_queue_opt = "--gpu 1"
+        run_opts.prior_gpu_opt = "--use-gpu=yes"
+        run_opts.prior_queue_opt = "--gpu 1"
+
+    else:
+        logger.warning("""
+    Without using a GPU this will be very slow. nnet3 does not yet support multiple threads.""")
+
+        run_opts.train_queue_opt = ""
+        run_opts.parallel_train_opts = "--use-gpu=no"
+        run_opts.combine_queue_opt = ""
+        run_opts.prior_gpu_opt = "--use-gpu=no"
+        run_opts.prior_queue_opt = ""
+
+    run_opts.command = args.command
+    run_opts.egs_command = args.egs_command if args.egs_command is not None else args.command
+    run_opts.num_jobs_compute_prior = args.num_jobs_compute_prior
+
+    return [args, run_opts]
+
+# args is a Namespace with the required parameters
+def Train(args, run_opts):
+    arg_string = pprint.pformat(vars(args))
+    logger.info("Arguments for the experiment\n{0}".format(arg_string))
+
+    # Set some variables.
+    feat_dim = GetFeatDim(args.feat_dir)
+    ivector_dim = GetIvectorDim(args.online_ivector_dir)
+
+    # split the training data into parts for individual jobs
+    SplitData(args.feat_dir, args.nj)
+
+    config_dir = '{0}/configs'.format(args.dir)
+    var_file = '{0}/vars'.format(config_dir)
+
+    variables = ParseGenericConfigVarsFile(var_file)
+
+    # Set some variables.
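For reference, the vars file read below is assumed to hold simple key=value lines written by make_configs.py; a plausible example (values are hypothetical) that would satisfy the lookups in the try block that follows:

    model_left_context=14
    model_right_context=10
    num_hidden_layers=3
    num_targets=3000
    add_lda=true
    include_log_softmax=true
    objective_type=linear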
+ + try: + model_left_context = variables['model_left_context'] + model_right_context = variables['model_right_context'] + num_hidden_layers = variables['num_hidden_layers'] + num_targets = int(variables['num_targets']) + add_lda = StrToBool(variables['add_lda']) + include_log_softmax = StrToBool(variables['include_log_softmax']) + objective_type = variables['objective_type'] + except KeyError as e: + raise Exception("KeyError {0}: Variables need to be defined in {1}".format( + str(e), '{0}/configs'.format(args.dir))) + + left_context = args.chunk_left_context + model_left_context + right_context = args.chunk_right_context + model_right_context + + # Initialize as "raw" nnet, prior to training the LDA-like preconditioning + # matrix. This first config just does any initial splicing that we do; + # we do this as it's a convenient way to get the stats for the 'lda-like' + # transform. + + if args.use_dense_targets: + if GetFeatDimFromScp(args.targets_scp) != num_targets: + raise Exception("Mismatch between num-targets provided to " + "script vs configs") + + if (args.stage <= -4): + logger.info("Initializing a basic network") + RunKaldiCommand(""" +{command} {dir}/log/nnet_init.log \ + nnet3-init --srand=-2 {dir}/configs/init.config {dir}/init.raw + """.format(command = run_opts.command, + dir = args.dir)) + + default_egs_dir = '{0}/egs'.format(args.dir) + + if args.use_dense_targets: + target_type = "dense" + compute_accuracy = False + else: + target_type = "sparse" + compute_accuracy = True if objective_type == "linear" else False + + if (args.stage <= -3) and args.egs_dir is None: + logger.info("Generating egs") + + GenerateEgsUsingTargets(args.feat_dir, args.targets_scp, default_egs_dir, + left_context, right_context, + args.chunk_width + left_context, + args.chunk_width + right_context, run_opts, + frames_per_eg = args.chunk_width, + srand = args.srand, + egs_opts = args.egs_opts, + cmvn_opts = args.cmvn_opts, + online_ivector_dir = args.online_ivector_dir, + samples_per_iter = args.samples_per_iter, + transform_dir = args.transform_dir, + stage = args.egs_stage, + target_type = target_type, + num_targets = num_targets) + + if args.egs_dir is None: + egs_dir = default_egs_dir + else: + egs_dir = args.egs_dir + + [egs_left_context, egs_right_context, frames_per_eg, num_archives] = VerifyEgsDir(egs_dir, feat_dim, ivector_dim, left_context, right_context) + assert(args.chunk_width == frames_per_eg) + + if (args.num_jobs_final > num_archives): + raise Exception('num_jobs_final cannot exceed the number of archives in the egs directory') + + # copy the properties of the egs to dir for + # use during decoding + CopyEgsPropertiesToExpDir(egs_dir, args.dir) + + if (add_lda and args.stage <= -2): + logger.info('Computing the preconditioning matrix for input features') + + ComputePreconditioningMatrix(args.dir, egs_dir, num_archives, run_opts, + max_lda_jobs = args.max_lda_jobs, + rand_prune = args.rand_prune) + + + if (args.stage <= -1): + logger.info("Preparing the initial acoustic model.") + PrepareInitialNetwork(args.dir, run_opts) + + + # set num_iters so that as close as possible, we process the data $num_epochs + # times, i.e. $num_iters*$avg_num_jobs) == $num_epochs*$num_archives, + # where avg_num_jobs=(num_jobs_initial+num_jobs_final)/2. 
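Unlike the frame-level DNN case above, the chunk-based computation below does not expand the archive count by frames-per-eg, since each eg is trained as a whole chunk; a worked example with hypothetical numbers:

    # hypothetical numbers for the chunk-based (RNN) case
    num_archives = 90; num_epochs = 4
    num_jobs_initial = 1; num_jobs_final = 8
    num_archives_to_process = num_epochs * num_archives                               # 360
    num_iters = (num_archives_to_process * 2) // (num_jobs_initial + num_jobs_final)  # 80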
+    num_archives_to_process = args.num_epochs * num_archives
+    num_archives_processed = 0
+    num_iters=(num_archives_to_process * 2) / (args.num_jobs_initial + args.num_jobs_final)
+
+    num_iters_combine = VerifyIterations(num_iters, args.num_epochs,
+                                         num_hidden_layers, num_archives,
+                                         args.max_models_combine, args.add_layers_period,
+                                         args.num_jobs_final)
+
+    learning_rate = lambda iter, current_num_jobs, num_archives_processed: GetLearningRate(iter, current_num_jobs, num_iters,
+                                                                                           num_archives_processed,
+                                                                                           num_archives_to_process,
+                                                                                           args.initial_effective_lrate,
+                                                                                           args.final_effective_lrate)
+    if args.num_bptt_steps is None:
+        num_bptt_steps = args.chunk_width
+    else:
+        num_bptt_steps = args.num_bptt_steps
+
+    min_deriv_time = args.chunk_width - num_bptt_steps
+
+
+    logger.info("Training will run for {0} epochs = {1} iterations".format(args.num_epochs, num_iters))
+    for iter in range(num_iters):
+        if (args.exit_stage is not None) and (iter == args.exit_stage):
+            logger.info("Exiting early due to --exit-stage {0}".format(iter))
+            return
+        current_num_jobs = int(0.5 + args.num_jobs_initial + (args.num_jobs_final - args.num_jobs_initial) * float(iter) / num_iters)
+
+        if args.stage <= iter:
+            model_file = "{dir}/{iter}.raw".format(dir = args.dir, iter = iter)
+            shrinkage_value = args.shrink_value if DoShrinkage(iter, model_file, "Lstm*", "SigmoidComponent", args.shrink_threshold, get_raw_nnet_from_am = False) else 1
+            logger.info("On iteration {0}, learning rate is {1} and shrink value is {2}.".format(iter, learning_rate(iter, current_num_jobs, num_archives_processed), shrinkage_value))
+
+            rnn_train_lib.TrainOneIteration(
+                dir = args.dir,
+                iter = iter,
+                srand = args.srand,
+                egs_dir = egs_dir,
+                num_jobs = current_num_jobs,
+                num_archives_processed = num_archives_processed,
+                num_archives = num_archives,
+                learning_rate = learning_rate(iter, current_num_jobs, num_archives_processed),
+                shrinkage_value = shrinkage_value,
+                num_chunk_per_minibatch = args.num_chunk_per_minibatch,
+                num_hidden_layers = num_hidden_layers,
+                add_layers_period = args.add_layers_period,
+                left_context = left_context,
+                right_context = right_context,
+                min_deriv_time = min_deriv_time,
+                momentum = args.momentum,
+                max_param_change = args.max_param_change,
+                shuffle_buffer_size = args.shuffle_buffer_size,
+                cv_minibatch_size = args.cv_minibatch_size,
+                run_opts = run_opts,
+                compute_accuracy = compute_accuracy,
+                get_raw_nnet_from_am = False)
+
+            if args.cleanup:
+                # clean up everything but the last 2 models, under certain conditions
+                RemoveModel(args.dir, iter-2, num_iters, num_iters_combine,
+                            args.preserve_model_interval, get_raw_nnet_from_am = False)
+
+            if args.email is not None:
+                reporting_iter_interval = num_iters * args.reporting_interval
+                if iter % reporting_iter_interval == 0:
+                    # let's do some reporting
+                    [report, times, data] = nnet3_log_parse.GenerateAccuracyReport(args.dir)
+                    message = report
+                    subject = "Update : Expt {dir} : Iter {iter}".format(dir = args.dir, iter = iter)
+                    SendMail(message, subject, args.email)
+
+        num_archives_processed = num_archives_processed + current_num_jobs
+
+    if args.stage <= num_iters:
+        logger.info("Doing final combination to produce final.raw")
+        CombineModels(args.dir, num_iters, num_iters_combine, egs_dir, run_opts,
+                      chunk_width = args.chunk_width, get_raw_nnet_from_am = False, compute_accuracy = compute_accuracy)
+
+    if include_log_softmax and args.stage <= num_iters + 1:
+        logger.info("Getting average posterior for purpose of using as priors to convert
posteriors into likelihoods.") + avg_post_vec_file = ComputeAveragePosterior(args.dir, 'final', egs_dir, + num_archives, args.prior_subset_size, run_opts, get_raw_nnet_from_am = False) + + if args.cleanup: + logger.info("Cleaning up the experiment directory {0}".format(args.dir)) + remove_egs = args.remove_egs + if args.egs_dir is not None: + # this egs_dir was not created by this experiment so we will not + # delete it + remove_egs = False + + CleanNnetDir(args.dir, num_iters, egs_dir, + preserve_model_interval = args.preserve_model_interval, + remove_egs = remove_egs, + get_raw_nnet_from_am = False) + + # do some reporting + [report, times, data] = nnet3_log_parse.GenerateAccuracyReport(args.dir) + if args.email is not None: + SendMail(report, "Update : Expt {0} : complete".format(args.dir), args.email) + + report_handle = open("{dir}/accuracy.report".format(dir = args.dir), "w") + report_handle.write(report) + report_handle.close() + + os.system("steps/info/nnet3_dir_info.pl " + args.dir) + +def Main(): + [args, run_opts] = GetArgs() + try: + Train(args, run_opts) + except Exception as e: + if args.email is not None: + message = "Training session for experiment {dir} died due to an error.".format(dir = args.dir) + SendMail(message, message, args.email) + traceback.print_exc() + raise e + +if __name__ == "__main__": + Main() diff --git a/egs/wsj/s5/steps/nnet3/train_rnn.py b/egs/wsj/s5/steps/nnet3/train_rnn.py index 3763fb26303..a5679800db6 100755 --- a/egs/wsj/s5/steps/nnet3/train_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_rnn.py @@ -2,6 +2,7 @@ # Copyright 2016 Vijayaditya Peddinti. +# 2016 Vimal Manohar # Apache 2.0. @@ -17,7 +18,9 @@ import traceback from nnet3_train_lib import * -nnet3_log_parse = imp.load_source('', 'steps/nnet3/report/nnet3_log_parse_lib.py') +nnet3_log_parse = imp.load_source('nlp', 'steps/nnet3/report/nnet3_log_parse_lib.py') +rnn_train_lib = imp.load_source('rtl', 'steps/nnet3/libs/rnn_train_lib.py') +train_lib = imp.load_source('tl', 'steps/nnet3/libs/train_lib.py') logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -43,16 +46,10 @@ def GetArgs(): at the non-linearities are below a threshold. 3. RNNs can also be trained with state preservation training """, - formatter_class=argparse.ArgumentDefaultsHelpFormatter) + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + conflict_handler = 'resolve') - # feat options - parser.add_argument("--feat.online-ivector-dir", type=str, dest='online_ivector_dir', - default = None, action = NullstrToNoneAction, - help="""directory with the ivectors extracted in - an online fashion.""") - parser.add_argument("--feat.cmvn-opts", type=str, dest='cmvn_opts', - default = None, action = NullstrToNoneAction, - help="A string specifying '--norm-means' and '--norm-vars' values") + train_lib.AddCommonTrainArgs(parser) # egs extraction options parser.add_argument("--egs.chunk-width", type=int, dest='chunk_width', @@ -69,58 +66,6 @@ def GetArgs(): default = 0, help="""Number of right steps used in the estimation of BLSTM state before prediction of the first label""") - parser.add_argument("--egs.transform_dir", type=str, dest='transform_dir', - default = None, action = NullstrToNoneAction, - help="""String to provide options directly to steps/nnet3/get_egs.sh script""") - parser.add_argument("--egs.dir", type=str, dest='egs_dir', - default = None, action = NullstrToNoneAction, - help="""Directory with egs. 
If specified this directory - will be used rather than extracting egs""") - parser.add_argument("--egs.stage", type=int, dest='egs_stage', - default = 0, help="Stage at which get_egs.sh should be restarted") - parser.add_argument("--egs.opts", type=str, dest='egs_opts', - default = None, action = NullstrToNoneAction, - help="""String to provide options directly to steps/nnet3/get_egs.sh script""") - - # trainer options - parser.add_argument("--trainer.srand", type=int, dest='srand', - default = 0, - help="Sets the random seed for model initialization and egs shuffling. " - "Warning: This random seed does not control all aspects of this experiment. " - "There might be other random seeds used in other stages of the experiment " - "like data preparation (e.g. volume perturbation).") - parser.add_argument("--trainer.num-epochs", type=int, dest='num_epochs', - default = 8, - help="Number of epochs to train the model") - parser.add_argument("--trainer.prior-subset-size", type=int, dest='prior_subset_size', - default = 20000, - help="Number of samples for computing priors") - parser.add_argument("--trainer.num-jobs-compute-prior", type=int, dest='num_jobs_compute_prior', - default = 10, - help="The prior computation jobs are single threaded and run on the CPU") - parser.add_argument("--trainer.max-models-combine", type=int, dest='max_models_combine', - default = 20, - help="The maximum number of models used in the final model combination stage. These models will themselves be averages of iteration-number ranges") - parser.add_argument("--trainer.shuffle-buffer-size", type=int, dest='shuffle_buffer_size', - default = 5000, - help=""" Controls randomization of the samples on each - iteration. If 0 or a large value the randomization is - complete, but this will consume memory and cause spikes - in disk I/O. Smaller is easier on disk and memory but - less random. It's not a huge deal though, as samples - are anyway randomized right at the start. 
- (the point of this is to get data in different - minibatches on different iterations, since in the - preconditioning method, 2 samples in the same minibatch - can affect each others' gradients.""") - parser.add_argument("--trainer.add-layers-period", type=int, dest='add_layers_period', - default=2, - help="The number of iterations between adding layers during layer-wise discriminative training.") - parser.add_argument("--trainer.max-param-change", type=float, dest='max_param_change', - default=2.0, - help="""The maximum change in parameters allowed - per minibatch, measured in Frobenius norm over - the entire model""") parser.add_argument("--trainer.samples-per-iter", type=int, dest='samples_per_iter', default=20000, help="""This is really the number of egs in each @@ -128,49 +73,8 @@ def GetArgs(): for chunk_width=20, this value (20k) is equivalent to the 400k number that we use as a default in regular DNN training.""") - parser.add_argument("--trainer.lda.rand-prune", type=float, dest='rand_prune', - default=4.0, - help="""Value used in preconditioning matrix estimation""") - parser.add_argument("--trainer.lda.max-lda-jobs", type=float, dest='max_lda_jobs', - default=10, - help="""Max number of jobs used for LDA stats accumulation""") - - # Realignment parameters - parser.add_argument("--trainer.realign.command", type=str, dest='realign_command', - default=None, action=NullstrToNoneAction, - help="""Command to be used with steps/nnet3/align.sh during realignment""") - parser.add_argument("--trainer.realign.num-jobs", type=int, dest='realign_num_jobs', - default=30, - help="Number of jobs to use for realignment") - parser.add_argument("--trainer.realign.times", type=str, dest='realign_times', - default=None, action=NullstrToNoneAction, - help="""A space seperated string of realignment - times. Values must be between 0 and 1 - e.g. '0.1 0.2 0.3' """) - - parser.add_argument("--trainer.realign.use_gpu", type=str, dest='realign_use_gpu', - default=True, action=StrToBoolAction, - choices = ["true", "false"], - help="If true, gpu is used with steps/nnet3/align.sh") # Parameters for the optimization - parser.add_argument("--trainer.optimization.initial-effective-lrate", type=float, dest='initial_effective_lrate', - default = 0.0003, - help="Learning rate used during the initial iteration") - parser.add_argument("--trainer.optimization.final-effective-lrate", type=float, dest='final_effective_lrate', - default = 0.00003, - help="Learning rate used during the final iteration") - parser.add_argument("--trainer.optimization.num-jobs-initial", type=int, dest='num_jobs_initial', - default = 1, - help="Number of neural net jobs to run in parallel at the start of training") - parser.add_argument("--trainer.optimization.num-jobs-final", type=int, dest='num_jobs_final', - default = 8, - help="Number of neural net jobs to run in parallel at the end of training") - parser.add_argument("--trainer.optimization.max-models-combine", type=int, dest='max_models_combine', - default = 20, - help = """ The is the maximum number of models we give to the - final 'combine' stage, but these models will themselves - be averages of iteration-number ranges. """) parser.add_argument("--trainer.optimization.momentum", type=float, dest='momentum', default = 0.5, help="""Momentum used in update computation. @@ -197,45 +101,10 @@ def GetArgs(): help="The number of time steps to back-propagate from the last label in the chunk. By default it is same as the chunk-width." 
) # General options - parser.add_argument("--stage", type=int, default=-4, - help="Specifies the stage of the experiment to execution from") - parser.add_argument("--exit-stage", type=int, default=None, - help="If specified, training exits before running this stage") - parser.add_argument("--cmd", type=str, action = NullstrToNoneAction, - dest = "command", - help="""Specifies the script to launch jobs. - e.g. queue.pl for launching on SGE cluster - run.pl for launching on local machine - """, default = "queue.pl") - parser.add_argument("--use-gpu", type=str, action = StrToBoolAction, - choices = ["true", "false"], - help="Use GPU for training", default=True) - parser.add_argument("--cleanup", type=str, action = StrToBoolAction, - choices = ["true", "false"], - help="Clean up models after training", default=True) - parser.add_argument("--cleanup.remove-egs", type=str, dest='remove_egs', - default = True, action = StrToBoolAction, - choices = ["true", "false"], - help="""If true, remove egs after experiment""") - parser.add_argument("--cleanup.preserve-model-interval", dest = "preserve_model_interval", - type=int, default=100, - help="Determines iterations for which models will be preserved during cleanup. If mod(iter,preserve_model_interval) == 0 model will be preserved.") - - parser.add_argument("--reporting.email", dest = "email", - type=str, default=None, action = NullstrToNoneAction, - help=""" Email-id to report about the progress of the experiment. - NOTE: It assumes the machine on which the script is being run can send - emails from command line via. mail program. The - Kaldi mailing list will not support this feature. - It might require local expertise to setup. """) - parser.add_argument("--reporting.interval", dest = "reporting_interval", - type=int, default=0.1, - help="Frequency with which reports have to be sent, measured in terms of fraction of iterations. 
If 0 and reporting mail has been specified then only failure notifications are sent") - parser.add_argument("--feat-dir", type=str, required = True, help="Directory with features used for training the neural network.") parser.add_argument("--lang", type=str, required = True, - help="Languade directory") + help="Language directory") parser.add_argument("--ali-dir", type=str, required = True, help="Directory with alignments used for training the neural network.") parser.add_argument("--dir", type=str, required = True, @@ -266,8 +135,9 @@ def ProcessArgs(args): if args.transform_dir is None: args.transform_dir = args.ali_dir + # set the options corresponding to args.use_gpu - run_opts = RunOpts() + run_opts = train_lib.RunOpts() if args.use_gpu: if not CheckIfCudaCompiled(): logger.warning(""" @@ -291,231 +161,12 @@ def ProcessArgs(args): run_opts.prior_gpu_opt = "--use-gpu=no" run_opts.prior_queue_opt = "" - if args.realign_use_gpu is True: - run_opts.realign_use_gpu = True - run_opts.realign_queue_opt = "--gpu 1" - else: - run_opts.realign_use_gpu = False - run_opts.realign_queue_opt = "" - - if args.realign_command is None: - run_opts.realign_command = args.command - else: - run_opts.realign_command = args.realign_command - run_opts.realign_num_jobs = args.realign_num_jobs - run_opts.command = args.command + run_opts.egs_command = args.egs_command if args.egs_command is not None else args.command run_opts.num_jobs_compute_prior = args.num_jobs_compute_prior return [args, run_opts] -class StrToBoolAction(argparse.Action): - """ A custom action to convert bools from shell format i.e., true/false - to python format i.e., True/False """ - def __call__(self, parser, namespace, values, option_string=None): - if values == "true": - setattr(namespace, self.dest, True) - elif values == "false": - setattr(namespace, self.dest, False) - else: - raise Exception("Unknown value {0} for --{1}".format(values, self.dest)) - -class NullstrToNoneAction(argparse.Action): - """ A custom action to convert empty strings passed by shell - to None in python. This is necessary as shell scripts print null strings - when a variable is not specified. We could use the more apt None - in python. """ - def __call__(self, parser, namespace, values, option_string=None): - if values.strip() == "": - setattr(namespace, self.dest, None) - else: - setattr(namespace, self.dest, values) - - -# a class to store run options -class RunOpts: - def __init__(self): - self.command = None - self.train_queue_opt = None - self.combine_queue_opt = None - self.prior_gpu_opt = None - self.prior_queue_opt = None - self.parallel_train_opts = None - self.realign_use_gpu = None - - -def TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archives, - raw_model_string, egs_dir, - left_context, right_context, min_deriv_time, - momentum, max_param_change, - shuffle_buffer_size, num_chunk_per_minibatch, - cache_read_opt, run_opts): - # We cannot easily use a single parallel SGE job to do the main training, - # because the computation of which archive and which --frame option - # to use for each job is a little complex, so we spawn each one separately. 
- # this is no longer true for RNNs as we use do not use the --frame option - # but we use the same script for consistency with FF-DNN code - - context_opts="--left-context={0} --right-context={1}".format( - left_context, right_context) - processes = [] - for job in range(1,num_jobs+1): - k = num_archives_processed + job - 1 # k is a zero-based index that we will derive - # the other indexes from. - archive_index = (k % num_archives) + 1 # work out the 1-based archive index. - - cache_write_opt = "" - if job == 1: - # an option for writing cache (storing pairs of nnet-computations and - # computation-requests) during training. - cache_write_opt="--write-cache={dir}/cache.{iter}".format(dir=dir, iter=iter+1) - - process_handle = RunKaldiCommand(""" -{command} {train_queue_opt} {dir}/log/train.{iter}.{job}.log \ - nnet3-train {parallel_train_opts} {cache_read_opt} {cache_write_opt} \ - --print-interval=10 --momentum={momentum} \ - --max-param-change={max_param_change} \ - --optimization.min-deriv-time={min_deriv_time} "{raw_model}" \ - "ark,bg:nnet3-copy-egs {context_opts} ark:{egs_dir}/egs.{archive_index}.ark ark:- | nnet3-shuffle-egs --buffer-size={shuffle_buffer_size} --srand={srand} ark:- ark:-| nnet3-merge-egs --minibatch-size={num_chunk_per_minibatch} --measure-output-frames=false --discard-partial-minibatches=true ark:- ark:- |" \ - {dir}/{next_iter}.{job}.raw - """.format(command = run_opts.command, - train_queue_opt = run_opts.train_queue_opt, - dir = dir, iter = iter, srand = iter + srand, next_iter = iter + 1, job = job, - parallel_train_opts = run_opts.parallel_train_opts, - cache_read_opt = cache_read_opt, cache_write_opt = cache_write_opt, - momentum = momentum, max_param_change = max_param_change, - min_deriv_time = min_deriv_time, - raw_model = raw_model_string, context_opts = context_opts, - egs_dir = egs_dir, archive_index = archive_index, - shuffle_buffer_size = shuffle_buffer_size, - num_chunk_per_minibatch = num_chunk_per_minibatch), - wait = False) - - processes.append(process_handle) - - all_success = True - for process in processes: - process.wait() - [stdout_value, stderr_value] = process.communicate() - print(stderr_value) - if process.returncode != 0: - all_success = False - - if not all_success: - open('{0}/.error'.format(dir), 'w').close() - raise Exception("There was error during training iteration {0}".format(iter)) - -def TrainOneIteration(dir, iter, srand, egs_dir, - num_jobs, num_archives_processed, num_archives, - learning_rate, shrinkage_value, num_chunk_per_minibatch, - num_hidden_layers, add_layers_period, - left_context, right_context, min_deriv_time, - momentum, max_param_change, shuffle_buffer_size, - cv_minibatch_size, run_opts): - # Set off jobs doing some diagnostics, in the background. - # Use the egs dir from the previous iteration for the diagnostics - logger.info("Training neural net (pass {0})".format(iter)) - - # check if different iterations use the same random seed - if os.path.exists('{0}/srand'.format(dir)): - try: - saved_srand = int(open('{0}/srand'.format(dir), 'r').readline().strip()) - except IOError, ValueError: - raise Exception('Exception while reading the random seed for training') - if srand != saved_srand: - logger.warning("The random seed provided to this iteration (srand={0}) is different from the one saved last time (srand={1}). 
Using srand={0}.".format(srand, saved_srand)) - else: - f = open('{0}/srand'.format(dir), 'w') - f.write(str(srand)) - f.close() - - - ComputeTrainCvProbabilities(dir, iter, egs_dir, run_opts, mb_size=cv_minibatch_size) - - if iter > 0: - ComputeProgress(dir, iter, egs_dir, run_opts, mb_size=cv_minibatch_size) - - # an option for writing cache (storing pairs of nnet-computations - # and computation-requests) during training. - cache_read_opt = "" - if iter > 0 and (iter <= (num_hidden_layers-1) * add_layers_period) and (iter % add_layers_period == 0): - do_average = False # if we've just mixed up, don't do averaging but take the - # best. - cur_num_hidden_layers = 1 + iter / add_layers_period - config_file = "{0}/configs/layer{1}.config".format(dir, cur_num_hidden_layers) - raw_model_string = "nnet3-am-copy --raw=true --learning-rate={lr} {dir}/{iter}.mdl - | nnet3-init --srand={srand} - {config} - |".format(lr=learning_rate, dir=dir, iter=iter, srand=iter + srand, config=config_file) - else: - do_average = True - if iter == 0: - do_average = False # on iteration 0, pick the best, don't average. - else: - cache_read_opt = "--read-cache={dir}/cache.{iter}".format(dir=dir, iter=iter) - raw_model_string = "nnet3-am-copy --raw=true --learning-rate={0} {1}/{2}.mdl - |".format(learning_rate, dir, iter) - - if do_average: - cur_num_chunk_per_minibatch = num_chunk_per_minibatch - else: - # on iteration zero or when we just added a layer, use a smaller minibatch - # size (and we will later choose the output of just one of the jobs): the - # model-averaging isn't always helpful when the model is changing too fast - # (i.e. it can worsen the objective function), and the smaller minibatch - # size will help to keep the update stable. - cur_num_chunk_per_minibatch = num_chunk_per_minibatch / 2 - - try: - os.remove("{0}/.error".format(dir)) - except OSError: - pass - - TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archives, - raw_model_string, egs_dir, - left_context, right_context, min_deriv_time, - momentum, max_param_change, - shuffle_buffer_size, cur_num_chunk_per_minibatch, - cache_read_opt, run_opts) - [models_to_average, best_model] = GetSuccessfulModels(num_jobs, '{0}/log/train.{1}.%.log'.format(dir,iter)) - nnets_list = [] - for n in models_to_average: - nnets_list.append("{0}/{1}.{2}.raw".format(dir, iter + 1, n)) - - if do_average: - # average the output of the different jobs. 
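The branch above decides three related things per iteration: whether a new hidden layer is spliced in, whether to average the parallel jobs' models or keep only the best one, and whether to halve the minibatch size to keep the update stable right after a change. A hedged Python sketch of just that decision logic (simplified; the real script also rebuilds the raw-model string in the layer-adding case):

def iteration_plan(iter, num_hidden_layers, add_layers_period,
                   num_chunk_per_minibatch):
    # A new layer is added every add_layers_period iterations until all
    # num_hidden_layers layers are in place.
    add_layer = (iter > 0
                 and iter <= (num_hidden_layers - 1) * add_layers_period
                 and iter % add_layers_period == 0)
    # On iteration 0 and right after adding a layer, pick the single best
    # job's model instead of averaging, and use a smaller minibatch.
    do_average = not (iter == 0 or add_layer)
    minibatch = (num_chunk_per_minibatch if do_average
                 else num_chunk_per_minibatch // 2)
    return add_layer, do_average, minibatch

# With 5 hidden layers added every 2 iterations:
for it in range(7):
    print(it, iteration_plan(it, 5, 2, 128))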
- RunKaldiCommand(""" -{command} {dir}/log/average.{iter}.log \ -nnet3-average {nnet_list} - \| \ -nnet3-am-copy --scale={shrink} --set-raw-nnet=- {dir}/{iter}.mdl {dir}/{new_iter}.mdl - """.format(command = run_opts.command, - dir = dir, - iter = iter, - nnet_list = " ".join(nnets_list), - shrink = shrinkage_value, - new_iter = iter + 1)) - - else: - # choose the best model from different jobs - RunKaldiCommand(""" -{command} {dir}/log/select.{iter}.log \ - nnet3-am-copy --scale={shrink} --set-raw-nnet={dir}/{next_iter}.{best_model_index}.raw {dir}/{iter}.mdl {dir}/{next_iter}.mdl - """.format(command = run_opts.command, - dir = dir, iter = iter, next_iter = iter + 1, - shrink = shrinkage_value, best_model_index = best_model)) - - try: - for i in range(1, num_jobs + 1): - os.remove("{0}/{1}.{2}.raw".format(dir, iter + 1, i)) - except OSError: - raise Exception("Error while trying to delete the raw models") - - new_model = "{0}/{1}.mdl".format(dir, iter + 1) - - if not os.path.isfile(new_model): - raise Exception("Could not find {0}, at the end of iteration {1}".format(new_model, iter)) - elif os.stat(new_model).st_size == 0: - raise Exception("{0} has size 0. Something went wrong in iteration {1}".format(new_model, iter)) - if cache_read_opt and os.path.exists("{0}/cache.{1}".format(dir, iter)): - os.remove("{0}/cache.{1}".format(dir, iter)) - - # args is a Namespace with the required parameters def Train(args, run_opts): arg_string = pprint.pformat(vars(args)) @@ -538,7 +189,21 @@ def Train(args, run_opts): config_dir = '{0}/configs'.format(args.dir) var_file = '{0}/vars'.format(config_dir) - [model_left_context, model_right_context, num_hidden_layers] = ParseModelConfigVarsFile(var_file) + variables = ParseGenericConfigVarsFile(var_file) + + # Set some variables. + + try: + model_left_context = variables['model_left_context'] + model_right_context = variables['model_right_context'] + num_hidden_layers = variables['num_hidden_layers'] + except KeyError as e: + raise Exception("KeyError {0}: Variables need to be defined in {1}".format( + str(e), '{0}/configs'.format(args.dir))) + + left_context = args.chunk_left_context + model_left_context + right_context = args.chunk_right_context + model_right_context + # Initialize as "raw" nnet, prior to training the LDA-like preconditioning # matrix. 
This first config just does any initial splicing that we do; # we do this as it's a convenient way to get the stats for the 'lda-like' @@ -552,9 +217,6 @@ def Train(args, run_opts): """.format(command = run_opts.command, dir = args.dir)) - left_context = args.chunk_left_context + model_left_context - right_context = args.chunk_right_context + model_right_context - default_egs_dir = '{0}/egs'.format(args.dir) if (args.stage <= -3) and args.egs_dir is None: logger.info("Generating egs") @@ -616,15 +278,6 @@ def Train(args, run_opts): num_archives_to_process, args.initial_effective_lrate, args.final_effective_lrate) - realign_iters = [] - if args.realign_times is not None: - realign_iters = GetRealignIters(args.realign_times, - num_iters, - args.num_jobs_initial, - args.num_jobs_final) - print(realign_iters) - # egs_dir will be updated if there is realignment - cur_egs_dir=egs_dir if args.num_bptt_steps is None: num_bptt_steps = args.chunk_width @@ -642,41 +295,31 @@ def Train(args, run_opts): current_num_jobs = int(0.5 + args.num_jobs_initial + (args.num_jobs_final - args.num_jobs_initial) * float(iter) / num_iters) if args.stage <= iter: - if iter in realign_iters: - logger.info("Re-aligning the data at iteration {0}".format(iter)) - prev_egs_dir=cur_egs_dir - cur_egs_dir="{0}/egs_{1}".format(args.dir, "iter"+str(iter)) - new_ali_dir="{0}/ali_{1}".format(args.dir, "iter"+str(iter)) - Realign(args.dir, iter, args.feat_dir, args.lang, - prev_egs_dir, cur_egs_dir, - args.prior_subset_size, num_archives, run_opts, - transform_dir = args.transform_dir, online_ivector_dir = args.online_ivector_dir) - if args.cleanup and args.egs_dir is None: - RemoveEgs(prev_egs_dir) model_file = "{dir}/{iter}.mdl".format(dir = args.dir, iter = iter) shrinkage_value = args.shrink_value if DoShrinkage(iter, model_file, "SigmoidComponent", args.shrink_threshold) else 1 logger.info("On iteration {0}, learning rate is {1} and shrink value is {2}.".format(iter, learning_rate(iter, current_num_jobs, num_archives_processed), shrinkage_value)) - TrainOneIteration(dir = args.dir, - iter = iter, - srand = args.srand, - egs_dir = egs_dir, - num_jobs = current_num_jobs, - num_archives_processed = num_archives_processed, - num_archives = num_archives, - learning_rate = learning_rate(iter, current_num_jobs, num_archives_processed), - shrinkage_value = shrinkage_value, - num_chunk_per_minibatch = args.num_chunk_per_minibatch, - num_hidden_layers = num_hidden_layers, - add_layers_period = args.add_layers_period, - left_context = left_context, - right_context = right_context, - min_deriv_time = min_deriv_time, - momentum = args.momentum, - max_param_change= args.max_param_change, - shuffle_buffer_size = args.shuffle_buffer_size, - cv_minibatch_size = args.cv_minibatch_size, - run_opts = run_opts) + rnn_train_lib.TrainOneIteration( + dir = args.dir, + iter = iter, + srand = args.srand, + egs_dir = egs_dir, + num_jobs = current_num_jobs, + num_archives_processed = num_archives_processed, + num_archives = num_archives, + learning_rate = learning_rate(iter, current_num_jobs, num_archives_processed), + shrinkage_value = shrinkage_value, + num_chunk_per_minibatch = args.num_chunk_per_minibatch, + num_hidden_layers = num_hidden_layers, + add_layers_period = args.add_layers_period, + left_context = left_context, + right_context = right_context, + min_deriv_time = min_deriv_time, + momentum = args.momentum, + max_param_change = args.max_param_change, + shuffle_buffer_size = args.shuffle_buffer_size, + cv_minibatch_size = 
args.cv_minibatch_size, + run_opts = run_opts) if args.cleanup: # do a clean up everythin but the last 2 models, under certain conditions @@ -690,7 +333,7 @@ def Train(args, run_opts): [report, times, data] = nnet3_log_parse.GenerateAccuracyReport(args.dir) message = report subject = "Update : Expt {dir} : Iter {iter}".format(dir = args.dir, iter = iter) - sendMail(message, subject, args.email) + SendMail(message, subject, args.email) num_archives_processed = num_archives_processed + current_num_jobs @@ -717,21 +360,20 @@ def Train(args, run_opts): # delete it remove_egs = False - CleanNnetDir(args.dir, num_iters, cur_egs_dir, + CleanNnetDir(args.dir, num_iters, egs_dir, preserve_model_interval = args.preserve_model_interval, remove_egs = remove_egs) # do some reporting [report, times, data] = nnet3_log_parse.GenerateAccuracyReport(args.dir) if args.email is not None: - sendMail(report, "Update : Expt {0} : complete".format(args.dir), args.email) + SendMail(report, "Update : Expt {0} : complete".format(args.dir), args.email) report_handle = open("{dir}/accuracy.report".format(dir = args.dir), "w") report_handle.write(report) report_handle.close() - os.system("steps/info/nnet3_dir_info.sh " + args.dir) - + os.system("steps/info/nnet3_dir_info.pl " + args.dir) def Main(): [args, run_opts] = GetArgs() @@ -740,19 +382,9 @@ def Main(): except Exception as e: if args.email is not None: message = "Training session for experiment {dir} died due to an error.".format(dir = args.dir) - sendMail(message, message, args.email) + SendMail(message, message, args.email) traceback.print_exc() raise e -def SendMail(message, subject, email_id): - try: - subprocess.Popen('echo "{message}" | mail -s "{subject}" {email} '.format( - message = message, - subject = subject, - email = email_id), shell=True) - except Exception as e: - logger.info(" Unable to send mail due to error:\n {error}".format(error = str(e))) - pass - if __name__ == "__main__": Main() diff --git a/src/base/kaldi-math-test.cc b/src/base/kaldi-math-test.cc index 52719cc4669..44ebfee01e0 100644 --- a/src/base/kaldi-math-test.cc +++ b/src/base/kaldi-math-test.cc @@ -128,6 +128,41 @@ void UnitTestRand() { KALDI_ASSERT(tot > (n * p * 0.8) && tot < (n * p * 1.2)); } } + { // test-1 RandIntDiscreteDist(). + int32 n = 10000, m = 10; + std::vector p(m, 0.0); + BaseFloat sum = 0.0; + // generate discrete probability distribution + for (int32 i = 0; i < m; i++) { + p[i] = RandUniform(); + if (RandInt(0,5) == 0) p[i] = 0; + sum += p[i]; + } + for (int32 i = 0; i < m; i++) + p[i] /= sum; + + std::vector rand_seq(n,0); + std::vector empirical_dist(m,0); + for (int32 i = 0; i < n; i++) { + rand_seq[i] = RandIntDiscreteDist(p); + // compute empirical distribution of generated sequence. + empirical_dist[rand_seq[i]] += 1.0/n; + } + + BaseFloat tmp = 0.0, kl_div = 0.0; + for (int32 i = 0; i < m; i++) { + if (p[i] < 0.0000001) { + KALDI_ASSERT(empirical_dist[i] <= 0.001); + KALDI_LOG << " p and q for i = " << i << " is " << p[i] << ", " << empirical_dist[i]; + } else { + if (empirical_dist[i] > 0.0) { + tmp = p[i]/empirical_dist[i]; + kl_div += p[i] * log(p[i]/empirical_dist[i]); + } + } + } + KALDI_ASSERT(kl_div < 0.001); + } { // test RandInt(). 
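The new unit test above draws 10000 samples from RandIntDiscreteDist, accumulates the empirical distribution, and asserts that its KL divergence from the target distribution stays below 0.001. A rough Python equivalent of both the cumulative-distribution sampler and that sanity check (the names sample_discrete and kl_check are made up for this sketch):

import bisect
import itertools
import math
import random

def sample_discrete(p, rng=random):
    # Build the cumulative distribution and locate a uniform draw in it,
    # as RandIntDiscreteDist does with std::lower_bound on the cdf.
    cdf = list(itertools.accumulate(p))
    r = min(rng.random(), 1.0)
    # Guard against r falling just above cdf[-1] due to rounding.
    return min(bisect.bisect_left(cdf, r), len(p) - 1)

def kl_check(p, n=10000, tol=0.001):
    counts = [0.0] * len(p)
    for _ in range(n):
        counts[sample_discrete(p)] += 1.0 / n
    kl = 0.0
    for pi, qi in zip(p, counts):
        if pi < 1e-7:
            assert qi <= 0.001          # near-zero bins are (almost) never drawn
        elif qi > 0.0:
            kl += pi * math.log(pi / qi)
    assert kl < tol, "empirical distribution is too far from the target"
    return kl

print(kl_check([0.1, 0.2, 0.3, 0.4]))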
KALDI_ASSERT(RandInt(0, 3) >= 0 && RandInt(0, 3) <= 3); diff --git a/src/base/kaldi-math.cc b/src/base/kaldi-math.cc index 40300331731..c095d12e42e 100644 --- a/src/base/kaldi-math.cc +++ b/src/base/kaldi-math.cc @@ -24,6 +24,7 @@ #include #endif #include +#include namespace kaldi { // These routines are tested in matrix/matrix-test.cc @@ -74,6 +75,26 @@ RandomState::RandomState() { seed = Rand() + 27437; } +int32 RandIntDiscreteDist(const std::vector &prob, struct RandomState* state) { + BaseFloat prob_sum = std::accumulate(prob.begin(), prob.end(), 0.0); + KALDI_ASSERT(prob_sum <= 1.1 && prob_sum >= 0.99); // probability distribution sum should be one. + std::vector scaled_prob(prob); + int32 prob_size = prob.size(); + for (int32 i = 0; i < prob_size; i++) + scaled_prob[i] *= 1.0 / prob_sum; + std::vector cdf(prob_size); // cumulative probability distribution. + cdf[0] = scaled_prob[0]; + // if cdf(i) < random number < cdf(i+1), it returns i. + for (int32 i = 1; i < prob_size; i++) + cdf[i] = cdf[i-1] + scaled_prob[i]; + BaseFloat rand_num = RandUniform(state); + if (rand_num > 1.0) rand_num = 1.0; + std::vector::iterator low = std::lower_bound(cdf.begin(), cdf.end(), rand_num); + int32 ans = low - cdf.begin(); + KALDI_ASSERT(ans >=0 && ans < prob_size); + return ans; +} + bool WithProb(BaseFloat prob, struct RandomState* state) { KALDI_ASSERT(prob >= 0 && prob <= 1.1); // prob should be <= 1.0, // but we allow slightly larger values that could arise from roundoff in diff --git a/src/base/kaldi-math.h b/src/base/kaldi-math.h index ac590a06a25..9020b0300d1 100644 --- a/src/base/kaldi-math.h +++ b/src/base/kaldi-math.h @@ -141,6 +141,12 @@ struct RandomState { // Returns a random integer between min and max inclusive. int32 RandInt(int32 min, int32 max, struct RandomState* state = NULL); +// Returns a random integer number according to a discrete probability distribution. +// It works based on sampling from a discrete distribution and +// it returns i with prob(i). +// prob must sume to one. +int32 RandIntDiscreteDist(const std::vector &prob, struct RandomState* = NULL); + // Returns true with probability "prob", bool WithProb(BaseFloat prob, struct RandomState* state = NULL); // with 0 <= prob <= 1 [we check this]. diff --git a/src/matrix/compressed-matrix.cc b/src/matrix/compressed-matrix.cc index 2ac2c544bc8..f7953bc407a 100644 --- a/src/matrix/compressed-matrix.cc +++ b/src/matrix/compressed-matrix.cc @@ -36,7 +36,17 @@ MatrixIndexT CompressedMatrix::DataSize(const GlobalHeader &header) { 2 * header.num_rows * header.num_cols; } } - +// scale all element of matrix by scaling floats +// in GlobalHeader with alpha. +void CompressedMatrix::Scale(float alpha) { + if (data_ != NULL) { + GlobalHeader *h = reinterpret_cast(data_); + // scale the floating point values in each PerColHolder + // and leave all integers the same. + h->min_value *= alpha; + h->range *= alpha; + } +} template void CompressedMatrix::CopyFromMat( diff --git a/src/matrix/compressed-matrix.h b/src/matrix/compressed-matrix.h index 603134ab800..4853b31b5e0 100644 --- a/src/matrix/compressed-matrix.h +++ b/src/matrix/compressed-matrix.h @@ -114,6 +114,10 @@ class CompressedMatrix { void Clear(); + /// scales all elements of matrix by alpha. + /// It scales the floating point values in GlobalHeader by alpha. 
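The new Scale() above works by touching only the two floats in the compressed header: every stored byte decodes through an affine map of min_value and range, so scaling those two floats scales every decompressed value without re-encoding the data. A toy Python/NumPy sketch of that idea (a single 8-bit range for the whole matrix; the per-column percentile headers of the real format are deliberately omitted):

import numpy as np

def compress(mat):
    # Toy 8-bit compression: store min and range plus one byte per element.
    mn, mx = float(mat.min()), float(mat.max())
    rng = (mx - mn) if mx > mn else 1.0
    codes = np.round((mat - mn) / rng * 255.0).astype(np.uint8)
    return {"min_value": mn, "range": rng, "codes": codes}

def decompress(c):
    return c["min_value"] + c["range"] * (c["codes"].astype(np.float64) / 255.0)

def scale(c, alpha):
    # Mirror of CompressedMatrix::Scale(): only the header floats change,
    # the integer codes stay exactly the same.
    c["min_value"] *= alpha
    c["range"] *= alpha

m = np.random.randn(4, 5)
c = compress(m)
before = decompress(c)
scale(c, 0.1)
# Decompressing now yields 0.1 times the previous values, which is what the
# new matrix-lib-test check verifies against an uncompressed scaled matrix.
print(np.max(np.abs(decompress(c) - 0.1 * before)))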
+ void Scale(float alpha); + friend class Matrix; friend class Matrix; private: @@ -163,7 +167,7 @@ class CompressedMatrix { static inline float CharToFloat(float p0, float p25, float p75, float p100, unsigned char value); - + void *data_; // first GlobalHeader, then PerColHeader (repeated), then // the byte data for each column (repeated). Note: don't intersperse // the byte data with the PerColHeaders, because of alignment issues. diff --git a/src/matrix/matrix-lib-test.cc b/src/matrix/matrix-lib-test.cc index 687ac66ac46..b2269590d80 100644 --- a/src/matrix/matrix-lib-test.cc +++ b/src/matrix/matrix-lib-test.cc @@ -4227,7 +4227,22 @@ template static void UnitTestCompressedMatrix() { } } } - + { // Check Scale() method for compressedMatrix. + for (int32 t = 0; t < 10; t++) { + float alpha = 0.1; + MatrixIndexT num_rows = 4 + Rand() % 20, + num_cols = 10 + Rand() % 50; + Matrix M(num_rows, num_cols); + M.SetRandn(); + CompressedMatrix cmat(M); + Matrix scaled_comp_mat(num_rows, num_cols), + scaled_mat(M); + scaled_mat.Scale(alpha); + cmat.Scale(alpha); + cmat.CopyToMat(&scaled_comp_mat); + scaled_comp_mat.ApproxEqual(scaled_mat, 1.0e-04); + } + } if (n < 5) { // test I/O. bool binary = (n % 2 == 1); { diff --git a/src/matrix/sparse-matrix.cc b/src/matrix/sparse-matrix.cc index 477d36f190a..77741d4cd09 100644 --- a/src/matrix/sparse-matrix.cc +++ b/src/matrix/sparse-matrix.cc @@ -52,6 +52,12 @@ Real SparseVector::Sum() const { return sum; } +template +void SparseVector::Scale(Real alpha) { + for (int32 i = 0; i < pairs_.size(); ++i) + pairs_[i].second *= alpha; +} + template template void SparseVector::CopyElementsToVec(VectorBase *vec) const { @@ -606,6 +612,13 @@ void SparseMatrix::AppendSparseMatrixRows( inputs->clear(); } +template +void SparseMatrix::Scale(Real alpha) { + MatrixIndexT num_rows = rows_.size(); + for (MatrixIndexT row = 0; row < num_rows; row++) + rows_[row].Scale(alpha); +} + template Real TraceMatSmat(const MatrixBase &A, const SparseMatrix &B, @@ -746,6 +759,16 @@ void GeneralMatrix::CopyToMat(MatrixBase *mat, } } +void GeneralMatrix::Scale(BaseFloat alpha) { + if (mat_.NumRows() !=0) { + mat_.Scale(alpha); + } else if (cmat_.NumRows() != 0) { + cmat_.Scale(alpha); + } else if (smat_.NumRows() != 0) { + smat_.Scale(alpha); + } + +} const SparseMatrix& GeneralMatrix::GetSparseMatrix() const { if (mat_.NumRows() != 0 || cmat_.NumRows() != 0) KALDI_ERR << "GetSparseMatrix called on GeneralMatrix of wrong type."; diff --git a/src/matrix/sparse-matrix.h b/src/matrix/sparse-matrix.h index 9f9362542e1..25ea83acb50 100644 --- a/src/matrix/sparse-matrix.h +++ b/src/matrix/sparse-matrix.h @@ -102,6 +102,9 @@ class SparseVector { void Write(std::ostream &os, bool binary) const; void Read(std::istream &os, bool binary); + + /// Scale all elements of sparse vector. + void Scale(Real alpha); private: MatrixIndexT dim_; @@ -195,6 +198,9 @@ class SparseMatrix { /// kUndefined behaves the same as kSetZero. void Resize(MatrixIndexT rows, MatrixIndexT cols, MatrixResizeType resize_type = kSetZero); + + /// Scale all elements in sparse matrix. + void Scale(Real alpha); // Use the Matrix::CopyFromSmat() function to copy from this to Matrix. Also // see Matrix::AddSmat(). There is not very extensive functionality for @@ -283,6 +289,9 @@ class GeneralMatrix { /// Implemented in ../cudamatrix/cu-sparse-matrix.cc void AddToMat(BaseFloat alpha, CuMatrixBase *cu_mat, MatrixTransposeType trans = kNoTrans) const; + + /// scale each element of matrix with a scalar value. 
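GeneralMatrix::Scale(), added above, simply dispatches to whichever of the three internal representations (full, compressed, or sparse) is populated. A small Python stand-in for that dispatch, using hypothetical container types rather than the Kaldi classes:

class GeneralMatrixSketch:
    # Holds at most one of three storages, loosely like GeneralMatrix.
    def __init__(self, dense=None, compressed=None, sparse=None):
        self.dense = dense            # list of rows of floats
        self.compressed = compressed  # e.g. {"min_value", "range", "codes"}
        self.sparse = sparse          # list of rows of (index, value) pairs

    def scale(self, alpha):
        # Scale whichever representation is non-empty; an empty matrix is a no-op.
        if self.dense is not None:
            self.dense = [[alpha * v for v in row] for row in self.dense]
        elif self.compressed is not None:
            self.compressed["min_value"] *= alpha
            self.compressed["range"] *= alpha
        elif self.sparse is not None:
            self.sparse = [[(i, alpha * v) for i, v in row] for row in self.sparse]

g = GeneralMatrixSketch(sparse=[[(0, 1.5), (3, -2.0)], [(2, 4.0)]])
g.scale(0.5)
print(g.sparse)   # [[(0, 0.75), (3, -1.0)], [(2, 2.0)]]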
+ void Scale(BaseFloat alpha); /// Assignment from regular matrix. GeneralMatrix &operator= (const MatrixBase &mat); diff --git a/src/nnet3/nnet-example.h b/src/nnet3/nnet-example.h index 1df7cd1e78e..eb5e03702f6 100644 --- a/src/nnet3/nnet-example.h +++ b/src/nnet3/nnet-example.h @@ -101,6 +101,7 @@ struct NnetExample { /// Caution: this operator == is not very efficient. It's only used in /// testing code. bool operator == (const NnetExample &other) const { return io == other.io; } + }; diff --git a/src/nnet3/nnet-nnet.h b/src/nnet3/nnet-nnet.h index 16e8333d5b1..83cc50c468a 100644 --- a/src/nnet3/nnet-nnet.h +++ b/src/nnet3/nnet-nnet.h @@ -186,7 +186,7 @@ class Nnet { /// returns index associated with this node name, or -1 if no such index. int32 GetNodeIndex(const std::string &node_name) const; - + /// returns index associated with this component name, or -1 if no such index. int32 GetComponentIndex(const std::string &node_name) const; diff --git a/src/nnet3/nnet-utils.cc b/src/nnet3/nnet-utils.cc index 955e200d072..c9495f076db 100644 --- a/src/nnet3/nnet-utils.cc +++ b/src/nnet3/nnet-utils.cc @@ -56,10 +56,9 @@ bool IsSimpleNnet(const Nnet &nnet) { // "input" and everything checks out. if (NumInputNodes(nnet) == 1) return true; - // Otherwise, there should be 2 inputs and one + // Otherwise, there should be input node with name input and one // should be called "ivector". - return NumInputNodes(nnet) == 2 && - nnet.GetNodeIndex("ivector") != -1 && + return nnet.GetNodeIndex("ivector") != -1 && nnet.IsInputNode(nnet.GetNodeIndex("ivector")); } diff --git a/src/nnet3/nnet-utils.h b/src/nnet3/nnet-utils.h index 9606bd5d5b7..41009189773 100644 --- a/src/nnet3/nnet-utils.h +++ b/src/nnet3/nnet-utils.h @@ -177,7 +177,6 @@ std::string NnetInfo(const Nnet &nnet); /// This function sets the dropout proportion in all dropout component to /// dropout_proportion value. void SetDropoutProportion(BaseFloat dropout_proportion, Nnet *nnet); - /// This function finds a list of components that are never used, and outputs /// the integer comopnent indexes (you can use these to index /// nnet.GetComponentNames() to get their names). diff --git a/src/nnet3bin/nnet3-copy-egs.cc b/src/nnet3bin/nnet3-copy-egs.cc index efb51f51910..746ec83dd1c 100644 --- a/src/nnet3bin/nnet3-copy-egs.cc +++ b/src/nnet3bin/nnet3-copy-egs.cc @@ -2,6 +2,7 @@ // Copyright 2012-2015 Johns Hopkins University (author: Daniel Povey) // 2014 Vimal Manohar +// 2016 Pegah Ghahremani // See ../../COPYING for clarification regarding multiple authors // @@ -27,6 +28,34 @@ namespace kaldi { namespace nnet3 { +// rename io-name of eg w.r.t io_names list e.g. input/input-1,output/output-1 +// 'input' is renamed to input-1 and 'output' renamed to output-1. +void RenameIoNames(const std::string &io_names, + NnetExample *eg_modified) { + std::vector separated_io_names; + SplitStringToVector(io_names, ",", true, &separated_io_names); + int32 num_modified_io = separated_io_names.size(), + io_size = eg_modified->io.size(); + std::vector orig_io_list; + for (int32 io_ind = 0; io_ind < io_size; io_ind++) + orig_io_list.push_back(eg_modified->io[io_ind].name); + + for (int32 ind = 0; ind < num_modified_io; ind++) { + std::vector rename_io_name; + SplitStringToVector(separated_io_names[ind], "/", true, &rename_io_name); + // find the io in eg with specific name and rename it to new name. 
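The new RenameIoNames helper above takes a spec such as "input/input-1,output/output-1" and renames the matching NnetIo entries of an example, treating a missing old name as an error. A compact Python sketch of the same parsing and renaming, operating on a plain list of names instead of an NnetExample:

def rename_io_names(io_names, eg_io_names):
    # io_names: e.g. "output/output-swahili"; eg_io_names: names of the
    # NnetIo members of one example.  Returns the renamed list.
    renamed = list(eg_io_names)
    for pair in io_names.split(","):
        old_name, new_name = pair.split("/")
        if old_name not in renamed:
            raise ValueError("No io-node with name %s exists in eg." % old_name)
        renamed[renamed.index(old_name)] = new_name
    return renamed

print(rename_io_names("output/output-swahili", ["input", "ivector", "output"]))
# -> ['input', 'ivector', 'output-swahili']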
+ + int32 rename_io_ind = + std::find(orig_io_list.begin(), orig_io_list.end(), rename_io_name[0]) - + orig_io_list.begin(); + + if (rename_io_ind >= io_size) + KALDI_ERR << "No io-node with name " << rename_io_name[0] + << "exists in eg."; + eg_modified->io[rename_io_ind].name = rename_io_name[1]; + } +} + // returns an integer randomly drawn with expected value "expected_count" // (will be either floor(expected_count) or ceil(expected_count)). int32 GetCount(double expected_count) { @@ -278,7 +307,9 @@ int main(int argc, char *argv[]) { // you can set frame to a number to select a single frame with a particular // offset, or to 'random' to select a random single frame. - std::string frame_str; + std::string frame_str, + weight_str = "", + output_str = ""; ParseOptions po(usage); po.Register("random", &random, "If true, will write frames to output " @@ -301,6 +332,16 @@ int main(int argc, char *argv[]) { "feature left-context that we output."); po.Register("right-context", &right_context, "Can be used to truncate the " "feature right-context that we output."); + po.Register("weights", &weight_str, + "Rspecifier maps the output posterior to each example" + "If provided, the supervision weight for output is scaled." + " Scaling supervision weight is the same as scaling to the derivative during training " + " in case of linear objective." + "The default is one, which means we are not applying per-example weights."); + po.Register("outputs", &output_str, + "Rspecifier maps example old output-name to new output-name in example." + " If provided, the NnetIo with name 'output' in each example " + " is renamed to new output name."); po.Read(argc, argv); @@ -315,6 +356,8 @@ int main(int argc, char *argv[]) { std::string examples_rspecifier = po.GetArg(1); SequentialNnetExampleReader example_reader(examples_rspecifier); + RandomAccessTokenReader output_reader(output_str); + RandomAccessBaseFloatReader egs_weight_reader(weight_str); int32 num_outputs = po.NumArgs() - 1; std::vector example_writers(num_outputs); @@ -322,7 +365,7 @@ int main(int argc, char *argv[]) { example_writers[i] = new NnetExampleWriter(po.GetArg(i+2)); - int64 num_read = 0, num_written = 0; + int64 num_read = 0, num_written = 0, num_err = 0; for (; !example_reader.Done(); example_reader.Next(), num_read++) { // count is normally 1; could be 0, or possibly >1. int32 count = GetCount(keep_proportion); @@ -332,12 +375,59 @@ int main(int argc, char *argv[]) { int32 index = (random ? Rand() : num_written) % num_outputs; if (frame_str == "" && left_context == -1 && right_context == -1 && frame_shift == 0) { - example_writers[index]->Write(key, eg); + NnetExample eg_modified = eg; + if (!weight_str.empty()) { + // scale the supervision weight for egs + if (!egs_weight_reader.HasKey(key)) { + KALDI_WARN << "No weight for example key " << key; + num_err++; + continue; + } + BaseFloat weight = egs_weight_reader.Value(key); + for (int32 i = 0; i < eg_modified.io.size(); i++) + if (eg_modified.io[i].name == "output") + eg_modified.io[i].features.Scale(weight); + } + if (!output_str.empty()) { + if (!output_reader.HasKey(key)) { + KALDI_WARN << "No new output-name for example key " << key; + num_err++; + continue; + } + std::string new_output_name = output_reader.Value(key); + // rename output io name to $new_output_name. 
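The --weights branch above multiplies the supervision features of the io named "output" by a per-example weight read from an rspecifier; for a linear objective this is equivalent to scaling that example's derivative during training, which is how low-confidence semi-supervised data can be down-weighted. A small sketch of that per-example scaling, with plain Python containers standing in for the Kaldi reader and example types:

def apply_example_weights(egs, weights):
    # egs: dict key -> list of (io_name, rows of floats);
    # weights: dict key -> float.  Only the "output" io is scaled; examples
    # without a weight are skipped and counted, like the C++ loop.
    num_err = 0
    for key, io_list in egs.items():
        if key not in weights:
            num_err += 1
            continue
        w = weights[key]
        for name, rows in io_list:
            if name == "output":
                for row in rows:
                    for j in range(len(row)):
                        row[j] *= w
    return num_err

egs = {"utt1": [("input", [[1.0, 2.0]]), ("output", [[0.0, 1.0]])]}
print(apply_example_weights(egs, {"utt1": 0.7}), egs["utt1"][1])
# -> 0 ('output', [[0.0, 0.7]])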
+ std::string rename_io_names = "output/" + new_output_name; + RenameIoNames(rename_io_names, &eg_modified); + } + example_writers[index]->Write(key, eg_modified); num_written++; } else { // the --frame option or context options were set. NnetExample eg_modified; if (SelectFromExample(eg, frame_str, left_context, right_context, frame_shift, &eg_modified)) { + if (!weight_str.empty()) { + // scale the supervision weight for egs + if (!egs_weight_reader.HasKey(key)) { + KALDI_WARN << "No weight for example key " << key; + num_err++; + continue; + } + int32 weight = egs_weight_reader.Value(key); + for (int32 i = 0; i < eg_modified.io.size(); i++) + if (eg_modified.io[i].name == "output") + eg_modified.io[i].features.Scale(weight); + } + if (!output_str.empty()) { + if (!output_reader.HasKey(key)) { + KALDI_WARN << "No new output-name for example key " << key; + num_err++; + continue; + } + std::string new_output_name = output_reader.Value(key); + // rename output io name to $new_output_name. + std::string rename_io_names = "output/" + new_output_name; + RenameIoNames(rename_io_names, &eg_modified); + } // this branch of the if statement will almost always be taken (should only // not be taken for shorter-than-normal egs from the end of a file. example_writers[index]->Write(key, eg_modified); diff --git a/src/nnet3bin/nnet3-copy.cc b/src/nnet3bin/nnet3-copy.cc index c419e0e0f91..e4a41933fff 100644 --- a/src/nnet3bin/nnet3-copy.cc +++ b/src/nnet3bin/nnet3-copy.cc @@ -44,6 +44,7 @@ int main(int argc, char *argv[]) { BaseFloat learning_rate = -1, dropout = 0.0; std::string nnet_config, edits_config, edits_str; + BaseFloat scale = 1.0; ParseOptions po(usage); po.Register("binary", &binary_write, "Write output in binary mode"); @@ -64,6 +65,8 @@ int main(int argc, char *argv[]) { "'--edits=remove-orphans'."); po.Register("set-dropout-proportion", &dropout, "Set dropout proportion " "in all DropoutComponent to this value."); + po.Register("scale", &scale, "The parameter matrices are scaled" + " by the specified value."); po.Read(argc, argv); if (po.NumArgs() != 2) { @@ -85,6 +88,9 @@ int main(int argc, char *argv[]) { if (learning_rate >= 0) SetLearningRate(learning_rate, &nnet); + if (scale != 1.0) + ScaleNnet(scale, &nnet); + if (dropout > 0) SetDropoutProportion(dropout, &nnet); diff --git a/src/nnet3bin/nnet3-merge-egs.cc b/src/nnet3bin/nnet3-merge-egs.cc index 8627671f53a..f214f1d60ea 100644 --- a/src/nnet3bin/nnet3-merge-egs.cc +++ b/src/nnet3bin/nnet3-merge-egs.cc @@ -26,13 +26,15 @@ namespace kaldi { namespace nnet3 { -// returns the number of indexes/frames in the NnetIo named "output" in the eg, +// returns the number of indexes/frames in the NnetIo with output +// including string "output" as part of its name in the eg, // or crashes if it is not there. +// e.g. output-0, output-xent int32 NumOutputIndexes(const NnetExample &eg) { for (size_t i = 0; i < eg.io.size(); i++) - if (eg.io[i].name == "output") + if (eg.io[i].name.find("output") != std::string::npos) return eg.io[i].indexes.size(); - KALDI_ERR << "No output named 'output' in the eg."; + KALDI_ERR << "No output name with string 'output' as part of its name exists in the eg."; return 0; // Suppress compiler warning. }
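Finally, the nnet3-merge-egs change above relaxes the output lookup: instead of requiring an io named exactly "output", any io whose name contains the string "output" (e.g. "output-0", "output-xent", or a renamed per-language output produced by the new --outputs option of nnet3-copy-egs) supplies the frame count. A last Python sketch of that lookup:

def num_output_indexes(io_list):
    # io_list: list of (name, indexes) for one example.  Return the number of
    # indexes of the first io whose name contains "output", else raise.
    for name, indexes in io_list:
        if "output" in name:
            return len(indexes)
    raise RuntimeError("No output name with the string 'output' as part of "
                       "its name exists in the eg.")

eg = [("input", list(range(23))), ("output-swahili", list(range(8)))]
print(num_output_indexes(eg))   # -> 8

Together with the renaming and weighting options, this appears to be what lets examples with language-specific output names flow through the standard merging binary in the multilingual recipes.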