diff --git a/egs/mini_librispeech/s5/local/chain/run_tdnn.sh b/egs/mini_librispeech/s5/local/chain/run_tdnn.sh index cb5756188a4..cebb2b84f16 120000 --- a/egs/mini_librispeech/s5/local/chain/run_tdnn.sh +++ b/egs/mini_librispeech/s5/local/chain/run_tdnn.sh @@ -1 +1 @@ -tuning/run_tdnn_1f.sh \ No newline at end of file +tuning/run_tdnn_1g.sh \ No newline at end of file diff --git a/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1g.sh b/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1g.sh new file mode 100755 index 00000000000..e234b847aa7 --- /dev/null +++ b/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1g.sh @@ -0,0 +1,311 @@ +#!/bin/bash + +# 1g is as 1f but adding dropout (well, something like dropout-- the mask +# is shared across time and it's continuous rather than zero-one), increasing +# the hidden dimension, and training for more epochs. + +# local/chain/compare_wer.sh --online exp/chain/tdnn1f_sp exp/chain/tdnn1g_sp +# System tdnn1f_sp tdnn1g_sp +#WER dev_clean_2 (tgsmall) 14.21 13.76 +# [online:] 14.18 13.72 +#WER dev_clean_2 (tglarge) 10.32 9.65 +# [online:] 10.25 9.85 +# Final train prob -0.0507 -0.0453 +# Final valid prob -0.0912 -0.0892 +# Final train prob (xent) -1.3550 -1.1694 +# Final valid prob (xent) -1.6018 -1.4486 +# Num-params 4205322 6227338 + + +# steps/info/chain_dir_info.pl exp/chain/tdnn1{f,g}_sp +# exp/chain/tdnn1f_sp: num-iters=17 nj=2..5 num-params=4.2M dim=40+100->2309 combine=-0.060->-0.060 (over 1) xent:train/valid[10,16,final]=(-1.61,-1.41,-1.36/-1.82,-1.66,-1.60) logprob:train/valid[10,16,final]=(-0.067,-0.057,-0.051/-0.106,-0.097,-0.091) +# exp/chain/tdnn1g_sp: num-iters=25 nj=2..5 num-params=6.2M dim=40+100->2309 combine=-0.054->-0.053 (over 2) xent:train/valid[15,24,final]=(-1.49,-1.22,-1.17/-1.75,-1.51,-1.45) logprob:train/valid[15,24,final]=(-0.063,-0.050,-0.045/-0.106,-0.096,-0.089) + + + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +decode_nj=10 +train_set=train_clean_5 +test_sets=dev_clean_2 +gmm=tri3b +nnet3_affix= + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +affix=1g # affix for the TDNN directory name +tree_affix= +train_stage=-10 +get_egs_stage=-10 +decode_iter= + +# training options +# training chunk-options +chunk_width=140,100,160 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +common_egs_dir= +xent_regularize=0.1 + +# training options +srand=0 +remove_egs=true +reporting_email= + +#decode options +test_online_decoding=true # if true, it will run the last decoding stage. + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. 
The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 13 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + opts="l2-regularize=0.05 dropout-per-dim-continuous=true" + output_opts="l2-regularize=0.02 bottleneck-dim=192" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $opts dim=512 + relu-batchnorm-dropout-layer name=tdnn2 $opts dim=512 input=Append(-1,0,1) + relu-batchnorm-dropout-layer name=tdnn3 $opts dim=512 + relu-batchnorm-dropout-layer name=tdnn4 $opts dim=512 input=Append(-1,0,1) + relu-batchnorm-dropout-layer name=tdnn5 $opts dim=512 + relu-batchnorm-dropout-layer name=tdnn6 $opts dim=512 input=Append(-3,0,3) + relu-batchnorm-dropout-layer name=tdnn7 $opts dim=512 input=Append(-3,0,3) + relu-batchnorm-dropout-layer name=tdnn8 $opts dim=512 input=Append(-6,-3,0) + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain $opts dim=512 + output-layer name=output include-log-softmax=false $output_opts dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn8 $opts dim=512 + output-layer name=output-xent $output_opts dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=15 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=256,128,64 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgsmall \ + $tree_dir $tree_dir/graph_tgsmall || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l h). +# This run failed due to instability. + +# 7m25i is as 7m25g but with dropout-per-dim-continuous=true. +# +# 7m25g is as 7m25f but with dim=1536 for the subsampled layers (more like 7m25d than 7m25e). + +# 7m25f is as 7m25e but with a dropout schedule borrowed from the LSTM experiments. +# +# 7m25e is as 7m25d but reverting dims back from 1536 to 1280. + +# 7m25d is as 7m25c but reverting to sharing the linear layer before the +# prefinal layer (more like 7m23t{,2}). Also changing one splicing input +# to be from a layer that wasn't otherwise used as splicing input. + +# 7m25c is as 7m25b but for the layers after we start using 3's not 1's, +# increasing dim from 1280 to 1536. +# 7m25b is as 7m25a but with slightly different skip connections, +# so all layers are the sources of skip connections. (Also see 7m23u, although +# that experiment didn't give clear results). +# 7m25a is as 7m23t but with some renamings of layers to make it more +# understandable, and changing how the last layer is done (there's now a little +# bit less sharing). + +# 7m23t is as 7m23r but with 1280 instead of 1536 as the dim. +# Differernce vs. 23r is unclear (maybe slightly worse), but it +# seems slightly better than 23h, and it's nice that it has fewer parameters. 
+ + +# local/chain/compare_wer_general.sh --rt03 tdnn7m23h_sp tdnn7m23r_sp tdnn7m23t_sp +# System tdnn7m23h_sp tdnn7m23r_sp tdnn7m23t_sp +# WER on train_dev(tg) 12.28 11.95 12.18 +# WER on train_dev(fg) 11.21 10.97 11.12 +# WER on eval2000(tg) 15.0 15.0 14.9 +# WER on eval2000(fg) 13.5 13.6 13.5 +# WER on rt03(tg) 18.5 18.4 18.4 +# WER on rt03(fg) 16.1 15.9 16.2 +# Final train prob -0.083 -0.076 -0.077 +# Final valid prob -0.097 -0.091 -0.093 +# Final train prob (xent) -1.036 -0.978 -0.994 +# Final valid prob (xent) -1.0629 -1.0026 -1.0194 +# Num-parameters 23513380 23513380 20111396 + +# 7m23r is as 7m23h but with 6 epochs instead of 4. See also 7m23p, which +# had 3 epochs. + +# 7m23h is as 7m23b2 but with a small bugfix, removing a stray 'bottleneck-dim=192'. +# Seems slightly better. The comparison below includes our old TDNN+LSTM result +# with dropout, to show that we're doing better than that now. + +# local/chain/compare_wer_general.sh --rt03 tdnn_lstm_1m_ld5_sp tdnn7m23b2_sp tdnn7m23h_sp +# System tdnn_lstm_1m_ld5_sp tdnn7m23b2_sp tdnn7m23h_sp +# WER on train_dev(tg) 12.33 12.38 12.28 +# WER on train_dev(fg) 11.42 11.44 11.21 +# WER on eval2000(tg) 15.2 15.1 15.0 +# WER on eval2000(fg) 13.8 13.6 13.5 +# WER on rt03(tg) 18.6 18.4 18.5 +# WER on rt03(fg) 16.3 16.1 16.1 +# Final train prob -0.082 -0.084 -0.083 +# Final valid prob -0.099 -0.098 -0.097 +# Final train prob (xent) -0.959 -1.049 -1.036 +# Final valid prob (xent) -1.0305 -1.0661 -1.0629 +# Num-parameters 39558436 23120164 23513380 +# +# 7m23b2 is as 7m23b but fixing an issue at the last layers. +# 7m23b is as 7m23 but making the splicing more 'symmetric'... doing the +# splicing in 2 stages. Interestingly, objf is not better than 23, but +# WER is slightly better. + +# local/chain/compare_wer_general.sh --rt03 tdnn7m19m_sp tdnn7m23_sp tdnn7m23b2_sp +# System tdnn7m19m_sp tdnn7m23_sp tdnn7m23b2_sp +# WER on train_dev(tg) 12.55 12.23 12.38 +# WER on train_dev(fg) 11.52 11.29 11.44 +# WER on eval2000(tg) 15.2 15.2 15.1 +# WER on eval2000(fg) 13.6 13.7 13.6 +# WER on rt03(tg) 18.6 18.7 18.4 +# WER on rt03(fg) 16.2 16.3 16.1 +# Final train prob -0.089 -0.083 -0.084 +# Final valid prob -0.101 -0.097 -0.098 +# Final train prob (xent) -1.080 -1.025 -1.049 +# Final valid prob (xent) -1.0990 -1.0548 -1.0661 +# Num-parameters 21055012 23120164 23120164 + + +# 7m23 is as 7m19m but removing the bottlenecks from the batchnorm components and +# reducing the dim of the linear components... it's basically an attempt to +# reverse the factorization to have the splicing at a different point. +# + +# 7m19m is as 7m19l but with more skip connections +# Hm-- seems better than 19h. +# +# local/chain/compare_wer_general.sh --rt03 tdnn7m19h_sp tdnn7m19l_sp tdnn7m19m_sp +# System tdnn7m19h_sp tdnn7m19l_sp tdnn7m19m_sp +# WER on train_dev(tg) 12.61 12.72 12.55 +# WER on train_dev(fg) 11.72 11.62 11.52 +# WER on eval2000(tg) 15.4 15.4 15.2 +# WER on eval2000(fg) 13.7 13.8 13.6 +# WER on rt03(tg) 18.9 18.9 18.6 +# WER on rt03(fg) 16.3 16.4 16.2 +# Final train prob -0.091 -0.091 -0.089 +# Final valid prob -0.102 -0.103 -0.101 +# Final train prob (xent) -1.098 -1.095 -1.080 +# Final valid prob (xent) -1.1031 -1.1191 -1.0990 +# Num-parameters 21055012 20268580 21055012 +# +# 7m19l is as 7m19h but projecting down to an intermediate dim (512) before +# doing the Append... doing this by inserting a linear-component between +# pairs of relu-batchnorm-layers. +# A little worse. 
+# local/chain/compare_wer_general.sh --rt03 tdnn7m19h_sp tdnn7m19l_sp +# System tdnn7m19h_sp tdnn7m19l_sp +# WER on train_dev(tg) 12.65 12.72 +# WER on train_dev(fg) 11.57 11.62 +# WER on eval2000(tg) 15.3 15.4 +# WER on eval2000(fg) 13.7 13.8 +# WER on rt03(tg) 18.8 18.9 +# WER on rt03(fg) 16.4 16.4 +# Final train prob -0.091 -0.091 +# Final valid prob -0.102 -0.103 +# Final train prob (xent) -1.091 -1.095 +# Final valid prob (xent) -1.1064 -1.1191 +# Num-parameters 21055012 20268580 + + +# 7m19h is as 7m19e but with an extra bypass connection. A bit better. + +# local/chain/compare_wer_general.sh --rt03 tdnn7m19e_sp tdnn7m19h_sp +# System tdnn7m19e_sp tdnn7m19h_sp +# WER on train_dev(tg) 12.75 12.65 +# WER on train_dev(fg) 11.77 11.57 +# WER on eval2000(tg) 15.5 15.3 +# WER on eval2000(fg) 14.0 13.7 +# WER on rt03(tg) 18.9 18.8 +# WER on rt03(fg) 16.4 16.4 +# Final train prob -0.092 -0.091 +# Final valid prob -0.102 -0.102 +# Final train prob (xent) -1.094 -1.091 +# Final valid prob (xent) -1.1095 -1.1064 +# Num-parameters 20760100 21055012 + +# 7m19e is as 7m19c,d but with dims increased to 1536. Better! + +# local/chain/compare_wer_general.sh --rt03 tdnn7m10_sp tdnn7m19c_sp tdnn7m19d_sp tdnn7m19e_sp +# System tdnn7m10_sp tdnn7m19c_sp tdnn7m19d_sp tdnn7m19e_sp +# WER on train_dev(tg) 13.77 12.86 13.01 12.75 +# WER on train_dev(fg) 12.65 11.82 12.02 11.77 +# WER on eval2000(tg) 16.1 15.4 15.7 15.5 +# WER on eval2000(fg) 14.3 13.8 14.0 14.0 +# WER on rt03(tg) 19.9 19.1 19.2 18.9 +# WER on rt03(fg) 17.4 16.6 16.7 16.4 +# Final train prob -0.111 -0.094 -0.096 -0.092 +# Final valid prob -0.120 -0.103 -0.105 -0.102 +# Final train prob (xent) -1.314 -1.117 -1.144 -1.094 +# Final valid prob (xent) -1.3247 -1.1223 -1.1478 -1.1095 +# Num-parameters 13361700 17824036 14887972 20760100 + +# local/chain/compare_wer_general.sh --rt03 tdnn7m16_sp tdnn7m19_sp tdnn7m19b_sp tdnn7m19c_sp tdnn7m19d_sp +# System tdnn7m16_sp tdnn7m19_sp tdnn7m19b_sp tdnn7m19c_sp tdnn7m19d_sp +# WER on train_dev(tg) 13.37 13.09 12.93 12.86 13.01 +# WER on train_dev(fg) 12.47 12.12 11.87 11.82 12.02 +# WER on eval2000(tg) 15.8 15.8 15.6 15.4 15.7 +# WER on eval2000(fg) 14.3 14.3 14.0 13.8 14.0 +# WER on rt03(tg) 15.1 14.8 14.9 14.8 14.9 +# WER on rt03(fg) 12.7 12.4 12.5 12.5 12.6 +# Final train prob -0.099 -0.096 -0.096 -0.094 -0.096 +# Final valid prob -0.110 -0.106 -0.106 -0.103 -0.105 +# Final train prob (xent) -1.302 -1.198 -1.188 -1.117 -1.144 +# Final valid prob (xent) -1.3184 -1.2070 -1.1980 -1.1223 -1.1478 +# Num-parameters 14216996 15528996 16512036 17824036 14887972 + +# 7m19c is as 7m19b but with one more layer (and moving the bypass connections up). +# Seems about 0.1% better. 
+ +# local/chain/compare_wer_general.sh --rt03 tdnn7m19_sp tdnn7m19b_sp tdnn7m19c_sp +# System tdnn7m19_sp tdnn7m19b_sp tdnn7m19c_sp +# WER on train_dev(tg) 13.09 12.93 12.86 +# WER on train_dev(fg) 12.12 11.87 11.82 +# WER on eval2000(tg) 15.8 15.6 15.4 +# WER on eval2000(fg) 14.3 14.0 13.8 +# WER on rt03(tg) 14.8 14.9 14.8 +# WER on rt03(fg) 12.4 12.5 12.5 +# Final train prob -0.096 -0.096 -0.094 +# Final valid prob -0.106 -0.106 -0.103 +# Final train prob (xent) -1.198 -1.188 -1.117 +# Final valid prob (xent) -1.2070 -1.1980 -1.1223 +# Num-parameters 15528996 16512036 17824036 + +# local/chain/compare_wer_general.sh --rt03 tdnn7m19_sp tdnn7m19b_sp +# System tdnn7m19_sp tdnn7m19b_sp +# WER on train_dev(tg) 13.09 12.93 +# WER on train_dev(fg) 12.12 11.87 +# WER on eval2000(tg) 15.8 15.6 +# WER on eval2000(fg) 14.3 14.0 +# WER on rt03(tg) 14.8 14.9 +# WER on rt03(fg) 12.4 12.5 +# Final train prob -0.096 -0.096 +# Final valid prob -0.106 -0.106 +# Final train prob (xent) -1.198 -1.188 +# Final valid prob (xent) -1.2070 -1.1980 +# Num-parameters 15528996 16512036 + +# 7m19 is as 7m16 but adding an extra -3,0,3 layer. +# CAUTION: messing with queue opts. +# 7m16 is as 7m15 but removing the chain l2-regularize. Does seem better. + +# local/chain/compare_wer_general.sh --rt03 tdnn7m12_sp tdnn7m15_sp tdnn7m16_sp +# System tdnn7m12_sp tdnn7m15_sp tdnn7m16_sp +# WER on train_dev(tg) 13.58 13.50 13.37 +# WER on train_dev(fg) 12.43 12.44 12.47 +# WER on eval2000(tg) 16.0 16.0 15.8 +# WER on eval2000(fg) 14.3 14.3 14.3 +# WER on rt03(tg) 15.2 15.4 15.1 +# WER on rt03(fg) 13.0 13.0 12.7 +# Final train prob -0.109 -0.111 -0.099 +# Final valid prob -0.117 -0.119 -0.110 +# Final train prob (xent) -1.278 -1.291 -1.302 +# Final valid prob (xent) -1.2880 -1.3036 -1.3184 +# Num-parameters 16089380 14216996 14216996 + +# 7m15 is as 7m12 but reducing the bottleneck dim at the output from +# 384 to 256 (like 11->14). +# 7m12 is as 7m11 but increasing all the TDNN dims from 1024 to 1280. +# Seems a little better but could be due to the increase in parameters. + +# local/chain/compare_wer_general.sh --rt03 tdnn7m8_sp tdnn7m9_sp tdnn7m10_sp tdnn7m11_sp tdnn7m12_sp +# System tdnn7m8_sp tdnn7m9_sp tdnn7m10_sp tdnn7m11_sp tdnn7m12_sp +# WER on train_dev(tg) 13.60 13.88 13.77 13.83 13.58 +# WER on train_dev(fg) 12.62 12.64 12.65 12.65 12.43 +# WER on eval2000(tg) 16.8 16.1 16.1 16.1 16.0 +# WER on eval2000(fg) 15.4 14.4 14.3 14.5 14.3 +# WER on rt03(tg) 16.2 15.5 15.6 15.3 15.2 +# WER on rt03(fg) 13.7 13.1 13.2 13.0 13.0 +# Final train prob -0.105 -0.111 -0.111 -0.109 -0.109 +# Final valid prob -0.115 -0.119 -0.120 -0.118 -0.117 +# Final train prob (xent) -1.282 -1.309 -1.314 -1.292 -1.278 +# Final valid prob (xent) -1.3194 -1.3246 -1.3247 -1.3077 -1.2880 +# Num-parameters 11580452 13818148 13361700 13809188 16089380 + +# 7m11 is as 7m10 but increasing the TDNN dims and reducing the bottlenecks. +# 7m10 is as 7m9 but reducing the bottleneck-dims for the non-splicing TDNN layers. +# 7m9 is as 7m8 but adding bottleneck-dims, and increasing the TDNN dims. 
+ +# local/chain/compare_wer_general.sh --rt03 tdnn7m8_sp tdnn7m9_sp +# System tdnn7m8_sp tdnn7m9_sp +# WER on train_dev(tg) 13.60 13.88 +# WER on train_dev(fg) 12.62 12.64 +# WER on eval2000(tg) 16.8 16.1 +# WER on eval2000(fg) 15.4 14.4 +# WER on rt03(tg) 16.2 15.5 +# WER on rt03(fg) 13.7 13.1 +# Final train prob -0.105 -0.111 +# Final valid prob -0.115 -0.119 +# Final train prob (xent) -1.282 -1.309 +# Final valid prob (xent) -1.3194 -1.3246 +# Num-parameters 11580452 13818148 + +# 7m8 is as 7m5b but double the l2-regularization for the TDNN layers, which +# is the same as 7m2->7m3, which was helpful there. +# Does seem helpful. + +# local/chain/compare_wer_general.sh --rt03 tdnn_7m_sp tdnn_7m2_sp tdnn7m5b_sp tdnn7m8_sp +# System tdnn_7m_sp tdnn_7m2_sp tdnn7m5b_sp tdnn7m8_sp +# WER on train_dev(tg) 13.70 13.74 13.81 13.60 +# WER on train_dev(fg) 12.67 12.76 12.74 12.62 +# WER on eval2000(tg) 16.6 17.1 17.0 16.8 +# WER on eval2000(fg) 15.1 15.4 15.4 15.4 +# WER on rt03(tg) 16.1 16.2 16.0 16.2 +# WER on rt03(fg) 13.7 13.8 13.6 13.7 +# Final train prob -0.085 -0.106 -0.104 -0.105 +# Final valid prob -0.103 -0.118 -0.116 -0.115 +# Final train prob (xent) -1.230 -1.296 -1.285 -1.282 +# Final valid prob (xent) -1.2704 -1.3318 -1.3283 -1.3194 +# Num-parameters 16292693 10924836 11580452 11580452 + + +# 7m5b is as 7m5 but rducing the prefinal layer dims to previous values. +# WER changes (+ is worse): +1 +1 +2 +3 -2 -2... so maybe worse on average, +# but not clear at all... for consistency with other setups I may retain +# this change. + +# local/chain/compare_wer_general.sh --rt03 tdnn_7m_sp tdnn_7m2_sp tdnn7m5_sp tdnn7m5b_sp +# System tdnn_7m_sp tdnn_7m2_sp tdnn7m5_sp tdnn7m5b_sp +# WER on train_dev(tg) 13.70 13.74 13.71 13.81 +# WER on train_dev(fg) 12.67 12.76 12.64 12.74 +# WER on eval2000(tg) 16.6 17.1 16.8 17.0 +# WER on eval2000(fg) 15.1 15.4 15.1 15.4 +# WER on rt03(tg) 16.1 16.2 16.2 16.0 +# WER on rt03(fg) 13.7 13.8 13.8 13.6 +# Final train prob -0.085 -0.106 -0.103 -0.104 +# Final valid prob -0.103 -0.118 -0.114 -0.116 +# Final train prob (xent) -1.230 -1.296 -1.274 -1.285 +# Final valid prob (xent) -1.2704 -1.3318 -1.3016 -1.3283 +# Num-parameters 16292693 10924836 12170788 11580452 + + +# 7m5 is as 7m2 but increasing the dimension of the last TDNN layer +# and the prefinal layers from 512 to 768. +# 7m2 is as 7m but with a bunch of tuning changes (model is smaller). +# 7m is as 7k but adding two non-splicing layers towards the beginning of the +# network. +# The impovement is pretty small but I've seen similar improvements on other +# setups with this architecture so I tend to believe it. 
+ + +# local/chain/compare_wer_general.sh tdnn_7k_sp tdnn_7m_sp +# System tdnn_7k_sp tdnn_7m_sp +# WER on train_dev(tg) 13.83 13.65 +# WER on train_dev(fg) 12.74 12.54 +# WER on eval2000(tg) 16.9 16.8 +# WER on eval2000(fg) 15.2 15.1 +# Final train prob -0.085 -0.084 +# Final valid prob -0.107 -0.103 +# Final train prob (xent) -1.267 -1.215 +# Final valid prob (xent) -1.3107 -1.2735 + +# steps/info/chain_dir_info.pl exp/chain/tdnn_7m_sp +# exp/chain/tdnn_7m_sp: num-iters=262 nj=3..16 num-params=16.3M dim=40+100->6034 combine=-0.103->-0.103 xent:train/valid[173,261,final]=(-1.28,-1.21,-1.21/-1.32,-1.27,-1.27) logprob:train/valid[173,261,final]=(-0.093,-0.084,-0.084/-0.109,-0.104,-0.103) + +set -e + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +affix=7m25l +suffix= +$speed_perturb && suffix=_sp +if [ -e data/rt03 ]; then maybe_rt03=rt03; else maybe_rt03= ; fi + +dir=exp/chain/tdnn${affix}${suffix} +decode_iter= +decode_nj=50 + +# training options +frames_per_eg=150,110,100 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 +dropout_schedule='0,0@0.20,0.5@0.50,0' + +test_online_decoding=false # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. This is the critically different + # step compared with other recipes. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + opts="l2-regularize=0.002 dropout-proportion=0.0 dropout-per-dim=true dropout-per-dim-continuous=true" + linear_opts="orthonormal-constraint=1.0" + output_opts="l2-regularize=0.0005" + + mkdir -p $dir/configs + + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $opts dim=1280 + linear-component name=tdnn2l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-dropout-layer name=tdnn2 $opts input=Append(0,1) dim=1280 + linear-component name=tdnn3l dim=256 $linear_opts + relu-batchnorm-dropout-layer name=tdnn3 $opts dim=1280 + linear-component name=tdnn4l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-dropout-layer name=tdnn4 $opts input=Append(0,1) dim=1280 + linear-component name=tdnn5l dim=256 $linear_opts + relu-batchnorm-dropout-layer name=tdnn5 $opts dim=1536 input=Append(0, tdnn3l) + linear-component name=tdnn6l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn6 $opts input=Append(0,3) dim=1536 + linear-component name=tdnn7l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn7 $opts 
input=Append(0,3,tdnn6l,tdnn4l,tdnn2l) dim=1536 + linear-component name=tdnn8l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn8 $opts input=Append(0,3) dim=1536 + linear-component name=tdnn9l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn9 $opts input=Append(0,3,tdnn8l,tdnn6l,tdnn5l) dim=1536 + linear-component name=tdnn10l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn10 $opts input=Append(0,3) dim=1536 + linear-component name=tdnn11l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn11 $opts input=Append(0,3,tdnn10l,tdnn9l,tdnn7l) dim=1536 + linear-component name=prefinal-l dim=256 $linear_opts + + relu-batchnorm-layer name=prefinal-chain input=prefinal-l $opts dim=1536 + output-layer name=output include-log-softmax=false dim=$num_targets bottleneck-dim=256 $output_opts + + relu-batchnorm-layer name=prefinal-xent input=prefinal-l $opts dim=1536 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor bottleneck-dim=256 $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "queue.pl --config /home/dpovey/queue_conly.conf" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 8 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + + +graph_dir=$dir/graph_sw1_tg +iter_opts= +if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " +fi +if [ $stage -le 15 ]; then + rm $dir/.error 2>/dev/null || true + for decode_set in train_dev eval2000 $maybe_rt03; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $decode_nj --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + +if $test_online_decoding && [ $stage -le 16 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. + steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3/extractor $dir ${dir}_online + + rm $dir/.error 2>/dev/null || true + for decode_set in train_dev eval2000 $maybe_rt03; do + ( + # note: we just give it "$decode_set" as it only uses the wav.scp, the + # feature type does not matter. + + steps/online/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + $graph_dir data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +exit 0; diff --git a/egs/wsj/s5/local/chain/tuning/run_tdnn_1f.sh b/egs/wsj/s5/local/chain/tuning/run_tdnn_1f.sh index be8d39de80b..e3d13ac1f65 100755 --- a/egs/wsj/s5/local/chain/tuning/run_tdnn_1f.sh +++ b/egs/wsj/s5/local/chain/tuning/run_tdnn_1f.sh @@ -4,7 +4,7 @@ # end, and no chain l2-regularize #[note: was 1e12e.] -# local/chain/compare_wer.sh exp/chain/tdnn1e10_sp exp/chain/tdnn1e12e_sp +# local/chain/compare_wer.sh exp/chain/tdnn1e_sp exp/chain/tdnn1f_sp # System tdnn1e10_sp tdnn1e12e_sp #WER dev93 (tgpr) 7.29 7.20 #WER dev93 (tg) 7.08 6.81 diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py index a3dfa89cf0e..eda1461a2ab 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py @@ -689,6 +689,9 @@ def set_default_configs(self): # 'dropout' in the name 'dropout-per-dim': False, # if dropout-per-dim=true, the dropout # mask is shared across time. + 'dropout-per-dim-continuous': False, # if you set this, it's + # like dropout-per-dim but with a + # continuous-valued (not zero-one) mask. 'add-log-stddev': False, # the following are not really inspected by this level of # code, just passed through (but not if left at ''). 
@@ -864,32 +867,19 @@ def _add_components(self, input_desc, input_dim, nonlinearities): ''.format(self.name, nonlinearity, output_dim)) elif nonlinearity == 'dropout': - if not self.config['dropout-per-dim']: + if not (self.config['dropout-per-dim'] or + self.config['dropout-per-dim-continuous']): line = ('component name={0}.{1} type=DropoutComponent ' 'dim={2} dropout-proportion={3}'.format( self.name, nonlinearity, output_dim, self.config['dropout-proportion'])) else: - line = ('component name={0}.dropout_mask type=DropoutMaskComponent ' - 'output-dim={1} dropout-proportion={2}'.format( - self.name, output_dim, self.config['dropout-proportion'])) - configs.append(line) - # note: the input to the dropout_mask component is never used, it's - # just syntactically required. - line = ('component-node name={0}.dropout_mask component={0}.dropout_mask ' - 'input={1}'.format(self.name, cur_node)) - configs.append(line) - line = ('component name={0}.dropout type=ElementwiseProductComponent ' - 'input-dim={1} output-dim={2} '.format( - self.name, 2 * output_dim, output_dim)) - configs.append(line) - line = ('component-node name={0}.dropout component={0}.dropout ' - 'input=Append({1}, ReplaceIndex({0}.dropout_mask, t, 0))' - ''.format(self.name, cur_node)) - configs.append(line) - cur_node = '{0}.dropout'.format(self.name) - continue + continuous_opt='continuous=true' if self.config['dropout-per-dim-continuous'] else '' + line = ('component name={0}.dropout type=GeneralDropoutComponent ' + 'dim={1} dropout-proportion={2} {3}'.format( + self.name, output_dim, self.config['dropout-proportion'], + continuous_opt)) else: raise RuntimeError("Unknown nonlinearity type: {0}" .format(nonlinearity)) diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index 87edd661a6f..6b8b1834749 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -221,7 +221,8 @@ def process_args(args): if (not os.path.exists(args.dir+"/configs") and (args.input_model is None or not os.path.exists(args.input_model))): raise Exception("Either --trainer.input-model option should be supplied, " - "and exist; or the {0}/configs directory should exist.") + "and exist; or the {0}/configs directory should exist." + "".format(args.dir)) if args.transform_dir is None: args.transform_dir = args.lat_dir diff --git a/src/chain/chain-denominator.cc b/src/chain/chain-denominator.cc index 620ea873eb7..c936061de26 100644 --- a/src/chain/chain-denominator.cc +++ b/src/chain/chain-denominator.cc @@ -33,10 +33,9 @@ DenominatorComputation::DenominatorComputation( den_graph_(den_graph), num_sequences_(num_sequences), frames_per_sequence_(nnet_output.NumRows() / num_sequences_), - exp_nnet_output_transposed_(nnet_output, kTrans), nnet_output_deriv_transposed_( - exp_nnet_output_transposed_.NumRows(), - std::min(exp_nnet_output_transposed_.NumCols(), + nnet_output.NumCols(), + std::min(nnet_output.NumRows(), static_cast(kMaxDerivTimeSteps) * num_sequences_)), alpha_(frames_per_sequence_ + 1, @@ -57,6 +56,14 @@ DenominatorComputation::DenominatorComputation( num_sequences_).SetZero(); KALDI_ASSERT(nnet_output.NumRows() % num_sequences == 0); + // the kStrideEqualNumCols argument means we'll allocate a contiguous block of + // memory for this; it is added to ensure that the same block of memory + // (cached in the allocator) can be used for xent_output_deriv when allocated + // from chain-training.cc. 
+ exp_nnet_output_transposed_.Resize(nnet_output.NumCols(), + nnet_output.NumRows(), + kUndefined, kStrideEqualNumCols); + exp_nnet_output_transposed_.CopyFromMat(nnet_output, kTrans); exp_nnet_output_transposed_.ApplyExp(); } diff --git a/src/chain/chain-training.cc b/src/chain/chain-training.cc index bf61bed67f0..f4b0d110373 100644 --- a/src/chain/chain-training.cc +++ b/src/chain/chain-training.cc @@ -52,8 +52,15 @@ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, nnet_output_deriv); } - if (xent_output_deriv != NULL) - xent_output_deriv->Resize(nnet_output.NumRows(), nnet_output.NumCols()); + if (xent_output_deriv != NULL) { + // the reason for kStrideEqualNumCols is so that we can share the memory + // block with the memory that was used for exp_nnet_output_transposed_ from + // chain-denominator.cc, which has just been freed; it also uses the + // kStrideEqualNumCols arg (its shape is the transpose of this matrix's + // shape). + xent_output_deriv->Resize(nnet_output.NumRows(), nnet_output.NumCols(), + kSetZero, kStrideEqualNumCols); + } { diff --git a/src/cudamatrix/cu-kernels-ansi.h b/src/cudamatrix/cu-kernels-ansi.h index 8ab03c7e14e..f2926ddc2f1 100644 --- a/src/cudamatrix/cu-kernels-ansi.h +++ b/src/cudamatrix/cu-kernels-ansi.h @@ -143,6 +143,12 @@ void cudaD_add_rows(dim3 Gr, dim3 Bl, double alpha, double* dst, void cudaF_add_rows(dim3 Gr, dim3 Bl, float alpha, float* dst, const float* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride); +void cudaD_mul_rows(dim3 Gr, dim3 Bl, double* dst, + const double* src, const MatrixIndexT_cuda* reorder, + MatrixDim dst_dim, int src_stride); +void cudaF_mul_rows(dim3 Gr, dim3 Bl, float* dst, const float* src, + const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, + int src_stride); void cudaD_add_rows_direct(dim3 Gr, dim3 Bl, double alpha, double* dst, const double* const * src, MatrixDim dst_dim); void cudaF_add_rows_direct(dim3 Gr, dim3 Bl, float alpha, float* dst, diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index ae7e25b716d..50dd3d1d0ca 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -1984,6 +1984,23 @@ static void _add_rows(Real alpha, Real* dst, const Real *src, } } +template +__global__ +static void _mul_rows(Real* dst, const Real *src, + const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, + int src_stride) { + int i = blockIdx.x * blockDim.x + threadIdx.x; // col index + int j = blockIdx.y * blockDim.y + threadIdx.y; // row index + if (i < dst_dim.cols && j < dst_dim.rows) { + int dst_index = j * dst_dim.stride + i; + if (reorder[j] >= 0) { + int src_index = reorder[j] * src_stride + i; + dst[dst_index] *= src[src_index]; + } + } +} + + template __global__ static void _add_rows(Real alpha, Real* dst, const Real * const *src, @@ -3764,6 +3781,12 @@ void cudaF_add_rows(dim3 Gr, dim3 Bl, float alpha, float* dst, const float* src, _add_rows<<>>(alpha, dst, src, reorder, dst_dim, src_stride); } +void cudaF_mul_rows(dim3 Gr, dim3 Bl, float* dst, const float* src, + const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, + int src_stride) { + _mul_rows<<>>(dst, src, reorder, dst_dim, src_stride); +} + void cudaF_add_rows_direct(dim3 Gr, dim3 Bl, float alpha, float* dst, const float* const * src, MatrixDim dst_dim) { _add_rows<<>>(alpha, dst, src, dst_dim); @@ -4454,6 +4477,12 @@ void cudaD_add_rows(dim3 Gr, dim3 Bl, double alpha, double* dst, _add_rows<<>>(alpha, dst, src, reorder, dst_dim, src_stride); } +void cudaD_mul_rows(dim3 Gr, dim3 Bl, 
double* dst, + const double* src, const MatrixIndexT_cuda* reorder, + MatrixDim dst_dim, int src_stride) { + _mul_rows<<>>(dst, src, reorder, dst_dim, src_stride); +} + void cudaD_add_rows_direct(dim3 Gr, dim3 Bl, double alpha, double* dst, const double* const * src, MatrixDim dst_dim) { _add_rows<<>>(alpha, dst, src, dst_dim); diff --git a/src/cudamatrix/cu-kernels.h b/src/cudamatrix/cu-kernels.h index 3518e0c71ed..fe706815a44 100644 --- a/src/cudamatrix/cu-kernels.h +++ b/src/cudamatrix/cu-kernels.h @@ -221,20 +221,30 @@ inline void cuda_add_rows(dim3 Gr, dim3 Bl, double alpha, double* dst, const double* const * src, MatrixDim dst_dim) { cudaD_add_rows_direct(Gr, Bl, alpha, dst, src, dst_dim); } +inline void cuda_add_rows(dim3 Gr, dim3 Bl, float alpha, float* dst, + const float* const * src, MatrixDim dst_dim) { + cudaF_add_rows_direct(Gr, Bl, alpha, dst, src, dst_dim); +} inline void cuda_add_rows(dim3 Gr, dim3 Bl, double alpha, double* dst, const double* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) { cudaD_add_rows(Gr, Bl, alpha, dst, src, reorder, dst_dim, src_stride); } -inline void cuda_add_rows(dim3 Gr, dim3 Bl, float alpha, float* dst, - const float* const * src, MatrixDim dst_dim) { - cudaF_add_rows_direct(Gr, Bl, alpha, dst, src, dst_dim); -} inline void cuda_add_rows(dim3 Gr, dim3 Bl, float alpha, float* dst, const float* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) { cudaF_add_rows(Gr, Bl, alpha, dst, src, reorder, dst_dim, src_stride); } +inline void cuda_mul_rows(dim3 Gr, dim3 Bl, double* dst, + const double* src, const MatrixIndexT_cuda* reorder, + MatrixDim dst_dim, int src_stride) { + cudaD_mul_rows(Gr, Bl, dst, src, reorder, dst_dim, src_stride); +} +inline void cuda_mul_rows(dim3 Gr, dim3 Bl, float* dst, + const float* src, const MatrixIndexT_cuda* reorder, + MatrixDim dst_dim, int src_stride) { + cudaF_mul_rows(Gr, Bl, dst, src, reorder, dst_dim, src_stride); +} inline void cuda_add_smat(dim3 Gr, dim3 Bl, double* mat, MatrixDim mat_dim, double alpha, const int* smat_row_ptr, const int* smat_col_idx, const double* smat_val) { diff --git a/src/cudamatrix/cu-matrix-test.cc b/src/cudamatrix/cu-matrix-test.cc index 909e5552a35..33db8b3e625 100644 --- a/src/cudamatrix/cu-matrix-test.cc +++ b/src/cudamatrix/cu-matrix-test.cc @@ -534,6 +534,42 @@ static void UnitTestCuMatrixAddRows() { } +template +static void UnitTestCuMatrixMulRows() { + for (int32 p = 0; p < 2; p++) { + MatrixIndexT num_rows1 = 10 + Rand() % 10, + num_rows2 = 10 + Rand() % 10, + num_cols = 10 + Rand() % 10; + CuMatrix M(num_rows1, num_cols); + M.SetRandn(); + + CuMatrix N1(num_rows2, num_cols), + O(num_rows2, num_cols); + std::vector reorder(num_rows2); + std::vector reorder_src(num_rows2, NULL); + for (int32 i = 0; i < num_rows2; i++) { + reorder[i] = -1 + (Rand() % (num_rows1 + 1)); + if (reorder[i] != -1) + reorder_src[i] = M.RowData(reorder[i]); + } + + CuArray reorder_cuda(reorder); + N1.MulRows(M, reorder_cuda); + + for (int32 i = 0; i < num_rows2; i++) { + if (reorder[i] != -1) { + CuSubVector O_row(O, i), + M_row(M, reorder[i]); + O_row.MulElements(M_row); + } + } + + AssertEqual(N1, O); + } +} + + + template static void UnitTestCuMatrixAddToRows() { for (int32 p = 0; p < 2; p++) { @@ -2914,6 +2950,7 @@ template void CudaMatrixUnitTest() { UnitTestCuMatrixCopyColsFromVec(); UnitTestCuMatrixCopyToRows(); UnitTestCuMatrixAddRows(); + UnitTestCuMatrixMulRows(); UnitTestCuMatrixAddToRows(); UnitTestCuMatrixAddRowRanges(); 
UnitTestCuMatrixAddTpMat(); diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index 813c5e75d14..34290561cc5 100644 --- a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -2722,6 +2722,41 @@ void CuMatrixBase::AddRows(Real alpha, } } +template +void CuMatrixBase::MulRows(const CuMatrixBase &src, + const CuArrayBase &indexes) { + if (NumRows() == 0) return; + KALDI_ASSERT(static_cast(indexes.Dim()) == NumRows()); +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { + KALDI_ASSERT(src.NumCols() == NumCols()); + CuTimer tim; + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); + cuda_mul_rows(dimGrid, dimBlock, + data_, src.Data(), indexes.Data(), Dim(), src.Stride()); + CU_SAFE_CALL(cudaGetLastError()); + CuDevice::Instantiate().AccuProfile(__func__, tim); + } else +#endif + { + MatrixBase &this_mat(Mat()); + const MatrixBase &src_mat(src.Mat()); + int32 num_rows = NumRows(); + const MatrixIndexT *index_ptr = indexes.Data(); + for (int32 r = 0; r < num_rows; r++) { + int32 src_r = index_ptr[r]; + if (src_r < 0) + continue; + SubVector this_row(this_mat, r), + src_row(src_mat, src_r); + this_row.MulElements(src_row); + } + } +} + + template void CuMatrixBase::AddRows(Real alpha, const CuArrayBase &src) { diff --git a/src/cudamatrix/cu-matrix.h b/src/cudamatrix/cu-matrix.h index 7c3a2a2e11f..86c50cfc485 100644 --- a/src/cudamatrix/cu-matrix.h +++ b/src/cudamatrix/cu-matrix.h @@ -139,6 +139,15 @@ class CuMatrixBase { const CuMatrixBase &src, const CuArrayBase &indexes); + + /// Does for each row r, this.Row(r) *= alpha * src.row(indexes[r]), + /// where '*=' is elementwise multiplication. + /// If indexes[r] < 0, does not add anything. + /// src.NumCols() must equal this.NumCols() + void MulRows(const CuMatrixBase &src, + const CuArrayBase &indexes); + + /// Does for each row r, this.Row(r) += alpha * src[r], /// treating src[r] as the beginning of a region of memory representing /// a vector of floats, of the same length as this.NumCols(). diff --git a/src/nnet3/nnet-chain-training.cc b/src/nnet3/nnet-chain-training.cc index 2080c60077b..844fb82d32a 100644 --- a/src/nnet3/nnet-chain-training.cc +++ b/src/nnet3/nnet-chain-training.cc @@ -92,18 +92,101 @@ void NnetChainTrainer::Train(const NnetChainExample &chain_eg) { num_minibatches_processed_++; } +// This object exists to help avoid memory fragmentation: it allocates, +// but does not use, the exact sizes of memory that are going to be needed +// in ComputeChainObjfAndDeriv(). +class ChainTrainerMemoryHolder { + public: + ChainTrainerMemoryHolder(const Nnet &nnet, + int32 num_den_graph_states, + const NnetChainExample &eg); + private: + CuMatrix nnet_output_deriv_; + CuMatrix xent_output_deriv_; + CuMatrix beta_; + CuMatrix alpha_; + +}; + +ChainTrainerMemoryHolder::ChainTrainerMemoryHolder(const Nnet &nnet, + int32 den_graph_states, + const NnetChainExample &eg) { + + std::vector::const_iterator iter = eg.outputs.begin(), + end = eg.outputs.end(); + + int32 max_rows = 0, + max_cols = 0; + + size_t max_frames_per_sequence = 0, + max_sequence_size = 0, + max_alpha_matrix_size = 0; + + for (; iter != end; ++iter) { + // there will normally be just one of these things; we'll normally loop once. 
+ const NnetChainSupervision &sup = *iter; + + int32 output_rows = sup.supervision.num_sequences * sup.supervision.frames_per_sequence; + int32 output_cols = nnet.OutputDim("output"); + + size_t curr_frames_per_sequence = output_rows / sup.supervision.num_sequences + 1; + size_t den_graph_size = den_graph_states + 1; + size_t curr_sequence_size = den_graph_size * sup.supervision.num_sequences; + size_t curr_alpha_matrix_size = curr_frames_per_sequence * curr_sequence_size; + + if (curr_alpha_matrix_size > max_alpha_matrix_size) { + max_alpha_matrix_size = curr_alpha_matrix_size; + max_frames_per_sequence = curr_frames_per_sequence; + max_sequence_size = curr_sequence_size; + } + + size_t matrix_size = output_rows * output_cols; + if (matrix_size > (max_rows * max_cols)) { + max_rows = output_rows; + max_cols = output_cols; + } + } + + // the sequence of resizes is in a specific order (bigger to smaller) + // so that the cudaMalloc won't trash the memory it has already + // alloc'd in the previous iterations + alpha_.Resize(max_frames_per_sequence, + max_sequence_size, + kUndefined); + + + nnet_output_deriv_.Resize(max_rows, max_cols, kUndefined); + // note: the same block of memory can be used for xent_output_deriv_ as is + // used for exp_nnet_output_transposed_ in chain-training.cc. + xent_output_deriv_.Resize(max_rows, max_cols, + kUndefined, kStrideEqualNumCols); + + beta_.Resize(2, max_sequence_size, kUndefined); +} + void NnetChainTrainer::TrainInternal(const NnetChainExample &eg, const NnetComputation &computation) { const NnetTrainerOptions &nnet_config = opts_.nnet_config; // note: because we give the 1st arg (nnet_) as a pointer to the // constructor of 'computer', it will use that copy of the nnet to - // store stats. This is mainly important for memory-norm. + // store stats. NnetComputer computer(nnet_config.compute_config, computation, nnet_, delta_nnet_); + + // reserve the memory needed in ProcessOutputs (before memory gets fragmented + // by the call to computer.Run(). + ChainTrainerMemoryHolder *memory_holder = + new ChainTrainerMemoryHolder(*nnet_, den_graph_.NumStates(), eg); + // give the inputs to the computer object. computer.AcceptInputs(*nnet_, eg.inputs); computer.Run(); + // 'this->ProcessOutputs()' is going to need the same sizes as are stored in + // 'memory_holder'. + delete memory_holder; + + // Probably could be merged in a single call PreallocateChainTrainerMemory(*nnet_, eg) ? this->ProcessOutputs(false, eg, &computer); computer.Run(); @@ -140,7 +223,7 @@ void NnetChainTrainer::TrainInternalBackstitch(const NnetChainExample &eg, const NnetTrainerOptions &nnet_config = opts_.nnet_config; // note: because we give the 1st arg (nnet_) as a pointer to the // constructor of 'computer', it will use that copy of the nnet to - // store stats. This is mainly important for memory-norm. + // store stats. NnetComputer computer(nnet_config.compute_config, computation, nnet_, delta_nnet_); // give the inputs to the computer object. 
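
[Editor's note] The ChainTrainerMemoryHolder added above relies on the caching behaviour of the GPU allocator: by allocating, and then immediately releasing, the largest matrices that ProcessOutputs() will need before computer.Run() fragments device memory, the later allocations of those exact sizes can be served from the allocator's cache. The standalone sketch below only illustrates that "reserve, then release" idea with a toy size-keyed cache; CachingAllocator, Malloc and Free are made-up names for illustration and are not Kaldi's CuMemoryAllocator API.

    // Toy illustration (not Kaldi code) of why pre-allocating and freeing a
    // large block before the forward pass helps a caching allocator.
    #include <cstddef>
    #include <iostream>
    #include <map>
    #include <vector>

    class CachingAllocator {              // stand-in for a caching GPU allocator
     public:
      ~CachingAllocator() {
        for (auto &kv : cache_)
          for (void *p : kv.second) ::operator delete(p);
      }
      void *Malloc(size_t bytes) {
        auto it = cache_.find(bytes);
        if (it != cache_.end() && !it->second.empty()) {
          void *p = it->second.back();    // reuse a cached block of this exact size
          it->second.pop_back();
          return p;
        }
        return ::operator new(bytes);     // otherwise take a fresh allocation
      }
      void Free(void *p, size_t bytes) { cache_[bytes].push_back(p); }
     private:
      std::map<size_t, std::vector<void*> > cache_;
    };

    int main() {
      CachingAllocator alloc;
      const size_t big = 64 * 1024 * 1024;
      void *held = alloc.Malloc(big);     // 1) "memory holder": grab the big block early
      alloc.Free(held, big);              // 2) release it; it stays in the cache
      // 3) ... the forward pass would allocate many smaller buffers here ...
      void *reused = alloc.Malloc(big);   // 4) the objf/deriv code gets the cached block back
      std::cout << (reused == held ? "reused cached block\n" : "fresh block\n");
      alloc.Free(reused, big);
      return 0;
    }

The same reasoning explains the comment in the constructor above about resizing from bigger to smaller: requesting the largest buffers first keeps the already-reserved blocks from being re-split before the matching requests arrive.
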
diff --git a/src/nnet3/nnet-component-itf.cc b/src/nnet3/nnet-component-itf.cc index ce4bbd0940a..c73f3fb921d 100644 --- a/src/nnet3/nnet-component-itf.cc +++ b/src/nnet3/nnet-component-itf.cc @@ -66,6 +66,8 @@ ComponentPrecomputedIndexes* ComponentPrecomputedIndexes::NewComponentPrecompute ans = new TimeHeightConvolutionComponent::PrecomputedIndexes(); } else if (cpi_type == "RestrictedAttentionComponentPrecomputedIndexes") { ans = new RestrictedAttentionComponent::PrecomputedIndexes(); + } else if (cpi_type == "GeneralDropoutComponentPrecomputedIndexes") { + ans = new GeneralDropoutComponentPrecomputedIndexes(); } if (ans != NULL) { KALDI_ASSERT(cpi_type == ans->Type()); @@ -158,6 +160,8 @@ Component* Component::NewComponentOfType(const std::string &component_type) { ans = new DropoutComponent(); } else if (component_type == "DropoutMaskComponent") { ans = new DropoutMaskComponent(); + } else if (component_type == "GeneralDropoutComponent") { + ans = new GeneralDropoutComponent(); } else if (component_type == "BackpropTruncationComponent") { ans = new BackpropTruncationComponent(); } else if (component_type == "LstmNonlinearityComponent") { diff --git a/src/nnet3/nnet-component-itf.h b/src/nnet3/nnet-component-itf.h index c34d550d681..79a1f1a5602 100644 --- a/src/nnet3/nnet-component-itf.h +++ b/src/nnet3/nnet-component-itf.h @@ -588,7 +588,7 @@ class UpdatableComponent: public Component { self-repair mechanism is activated. -1000 is a special value which will cause a component-specific default to be used. - block-dim Defaults to dim, but may be any nonzero divisor of dim. It affects the + block-dim Defaults to dim, but may be any divisor of dim. It affects the self-repair, which will be done while treating the input/output as repeating blocks of size 'block-dim' (e.g. blocks of filters). It allows us to do self-repair on the filter level in CNNs. 
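
[Editor's note] For reference, the continuous masks generated by the GeneralDropoutComponent and DropoutMaskComponent code below are produced by drawing uniform values in [0,1], scaling by 4p and adding (1 - 2p), i.e. the mask is uniform on [1 - 2p, 1 + 2p] with expected value exactly 1 (for the default p = 0.5 it ranges over [0, 2]). The standalone sketch below (not Kaldi code; the variable names are illustrative) just checks that transform numerically.

    // Numeric check of the continuous dropout-mask transform used below:
    // mask = u * 4p + (1 - 2p), with u ~ Uniform[0,1], so E[mask] = 1.
    #include <iostream>
    #include <random>

    int main() {
      const double p = 0.5;                       // dropout-proportion
      std::mt19937 rng(0);
      std::uniform_real_distribution<double> u(0.0, 1.0);
      const int n = 1000000;
      double sum = 0.0;
      for (int i = 0; i < n; i++) {
        double mask = u(rng) * 4.0 * p + (1.0 - 2.0 * p);  // same scale/offset as Propagate()
        sum += mask;
      }
      std::cout << "empirical mean of mask = " << sum / n   // should be close to 1.0
                << " (range [" << 1.0 - 2.0 * p << ", " << 1.0 + 2.0 * p << "])\n";
      return 0;
    }

Because the mask has expectation 1, test mode can simply substitute a constant mask of 1.0, which is what the Propagate() code below does when test_mode_ is set with continuous=true.
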
diff --git a/src/nnet3/nnet-component-test.cc b/src/nnet3/nnet-component-test.cc index d7595378c1f..39bd156e360 100644 --- a/src/nnet3/nnet-component-test.cc +++ b/src/nnet3/nnet-component-test.cc @@ -41,7 +41,7 @@ bool CheckStringsApproxEqual(const std::string &a, int32 tolerance = 3) { if (!StringsApproxEqual(a, b, tolerance)) { KALDI_WARN << "Strings differ: " << a - << "\vs.\n" << b; + << "\nvs.\n" << b; return false; } else { return true; diff --git a/src/nnet3/nnet-computation.cc b/src/nnet3/nnet-computation.cc index 98aed592a62..bb0e7c917fc 100644 --- a/src/nnet3/nnet-computation.cc +++ b/src/nnet3/nnet-computation.cc @@ -636,8 +636,8 @@ static void PrintCommand(std::ostream &os_out, KALDI_ASSERT(c.arg2 == kCompressedMatrixInt16); compressed_matrix_type = "uint16"; } - os << "CompressMatrix(" << submatrix_strings[c.arg1] - << ", " << range << ", " << compressed_matrix_type << ", " + os << "CompressMatrix(" << submatrix_strings[c.arg1] << ", " + << range << ", " << compressed_matrix_type << ", " << truncate << ")\n"; break; } diff --git a/src/nnet3/nnet-compute.cc b/src/nnet3/nnet-compute.cc index 19eecdda72b..cae6f41f5f2 100644 --- a/src/nnet3/nnet-compute.cc +++ b/src/nnet3/nnet-compute.cc @@ -399,8 +399,8 @@ void NnetComputer::ExecuteCommand() { compressed_matrices_[m]->CopyFromMat(matrices_[m]); matrices_[m].Resize(0, 0); } - break; #endif + break; case kDecompressMatrix: #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { diff --git a/src/nnet3/nnet-general-component.cc b/src/nnet3/nnet-general-component.cc index dd6e950a7d1..2720fbbd0bd 100644 --- a/src/nnet3/nnet-general-component.cc +++ b/src/nnet3/nnet-general-component.cc @@ -1392,16 +1392,19 @@ std::string DropoutMaskComponent::Info() const { stream << Type() << ", output-dim=" << output_dim_ << ", dropout-proportion=" << dropout_proportion_; + if (continuous_) + stream << ", continuous=true"; return stream.str(); } DropoutMaskComponent::DropoutMaskComponent(): - output_dim_(-1), dropout_proportion_(0.5) { } + output_dim_(-1), dropout_proportion_(0.5), continuous_(false) { } DropoutMaskComponent::DropoutMaskComponent( const DropoutMaskComponent &other): output_dim_(other.output_dim_), - dropout_proportion_(other.dropout_proportion_) { } + dropout_proportion_(other.dropout_proportion_), + continuous_(other.continuous_) { } void* DropoutMaskComponent::Propagate( const ComponentPrecomputedIndexes *indexes, @@ -1415,29 +1418,47 @@ void* DropoutMaskComponent::Propagate( out->Set(1.0); return NULL; } + + if (continuous_) { + if (test_mode_) { + out->Set(1.0); + } else { + const_cast&>(random_generator_).RandUniform(out); + out->Scale(dropout_proportion * 4.0); + // make the expected value 1.0. + out->Add(1.0 - (2.0 * dropout_proportion)); + } + return NULL; + } + if (test_mode_) { out->Set(1.0 - dropout_proportion); return NULL; } + const_cast&>(random_generator_).RandUniform(out); out->Add(-dropout_proportion); out->ApplyHeaviside(); - // To generate data where it's never the case that both of the dimensions - // for a row are zero, we generate uniformly distributed data (call this u_i), - // and for row i, set (*out)(i, 0) = (0 if u_i < dropout_proportion else 1) - // and (*out)(i, 1) = (0 if u_i > 1-dropout_proportion else 1) - int32 num_rows = out->NumRows(); - // later we may make this a bit more efficient. 
- CuVector temp(num_rows, kUndefined); - const_cast&>(random_generator_).RandUniform(&temp); - temp.Add(-dropout_proportion); - out->CopyColFromVec(temp, 0); - temp.Add(-1.0 + (2.0 * dropout_proportion)); - // Now, 'temp' contains the original uniformly-distributed data plus - // -(1 - dropout_proportion). - temp.Scale(-1.0); - out->CopyColFromVec(temp, 1); - out->ApplyHeaviside(); + + if (out->NumCols() == 2 || out->NumCols() == 3) { + // This is a kind of special case relevant to LSTms. + // To generate data where it's never the case that both of the dimensions + // for a row are zero, we generate uniformly distributed data (call this u_i), + // and for row i, set (*out)(i, 0) = (0 if u_i < dropout_proportion else 1) + // and (*out)(i, 1) = (0 if u_i > 1-dropout_proportion else 1) + int32 num_rows = out->NumRows(); + // later we may make this a bit more efficient. + CuVector temp(num_rows, kUndefined); + const_cast&>(random_generator_).RandUniform(&temp); + temp.Add(-dropout_proportion); + out->CopyColFromVec(temp, 0); + temp.Add(-1.0 + (2.0 * dropout_proportion)); + // Now, 'temp' contains the original uniformly-distributed data plus + // -(1 - dropout_proportion). + temp.Scale(-1.0); + out->CopyColFromVec(temp, 1); + out->ApplyHeaviside(); + } return NULL; } @@ -1447,15 +1468,19 @@ void DropoutMaskComponent::Read(std::istream &is, bool binary) { ReadBasicType(is, binary, &output_dim_); ExpectToken(is, binary, ""); ReadBasicType(is, binary, &dropout_proportion_); - std::string token; - ReadToken(is, binary, &token); - if (token == "") { + if (PeekToken(is, binary) == 'T') { + ExpectToken(is, binary, ""); ReadBasicType(is, binary, &test_mode_); // read test mode - ExpectToken(is, binary, ""); } else { test_mode_ = false; - KALDI_ASSERT(token == ""); } + if (PeekToken(is, binary) == 'C') { + ExpectToken(is, binary, ""); + continuous_ = true; + } else { + continuous_ = false; + } + ExpectToken(is, binary, ""); } @@ -1467,6 +1492,8 @@ void DropoutMaskComponent::Write(std::ostream &os, bool binary) const { WriteBasicType(os, binary, dropout_proportion_); WriteToken(os, binary, ""); WriteBasicType(os, binary, test_mode_); + if (continuous_) + WriteToken(os, binary, ""); WriteToken(os, binary, ""); } @@ -1480,11 +1507,280 @@ void DropoutMaskComponent::InitFromConfig(ConfigLine *cfl) { KALDI_ASSERT(ok && output_dim_ > 0); dropout_proportion_ = 0.5; cfl->GetValue("dropout-proportion", &dropout_proportion_); + continuous_ = false; + cfl->GetValue("continuous", &continuous_); test_mode_ = false; cfl->GetValue("test-mode", &test_mode_); } +std::string GeneralDropoutComponent::Info() const { + std::ostringstream stream; + stream << Type() + << ", dim=" << dim_ + << ", block-dim=" << block_dim_ + << ", dropout-proportion=" << dropout_proportion_; + if (continuous_) + stream << ", continuous=true"; + if (time_period_ > 0) + stream << ", time-period=" << time_period_; + return stream.str(); +} + +GeneralDropoutComponent::GeneralDropoutComponent(): + dim_(-1), block_dim_(-1), time_period_(0), + dropout_proportion_(0.5), continuous_(false) { } + +GeneralDropoutComponent::GeneralDropoutComponent( + const GeneralDropoutComponent &other): + dim_(other.dim_), + block_dim_(other.block_dim_), + time_period_(other.time_period_), + dropout_proportion_(other.dropout_proportion_), + continuous_(other.continuous_) { } + +void* GeneralDropoutComponent::Propagate( + const ComponentPrecomputedIndexes *indexes_in, + const CuMatrixBase &in, + CuMatrixBase *out) const { + + KALDI_ASSERT(SameDim(in, *out)); + + // The 
following will do nothing if 'out' and 'in' refer to the same data. + out->CopyFromMat(in); + + if (test_mode_ || dropout_proportion_ == 0.0) + return NULL; + + const GeneralDropoutComponentPrecomputedIndexes *indexes = + dynamic_cast(indexes_in); + KALDI_ASSERT(indexes != NULL); + + CuMatrix *mask = GetMemo(indexes->num_mask_rows); + + if (block_dim_ < dim_) { + KALDI_ASSERT(out->Stride() == out->NumCols()); + int32 num_rows = out->NumRows(), + dim_multiple = dim_ / block_dim_, + num_rows_reshaped = num_rows * dim_multiple; + CuSubMatrix out_reshaped(out->Data(), block_dim_, + num_rows_reshaped, + num_rows_reshaped); + out_reshaped.MulRows(*mask, indexes->indexes); + } else { + out->MulRows(*mask, indexes->indexes); + } + return mask; +} + +void GeneralDropoutComponent::Backprop( + const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes_in, + const CuMatrixBase &, // in_value + const CuMatrixBase &, // out_value + const CuMatrixBase &out_deriv, + void *memo, + Component *to_update, + CuMatrixBase *in_deriv) const { + KALDI_ASSERT(in_deriv != NULL && SameDim(*in_deriv, out_deriv)); + + // The following will do no work if in_deriv->Data() == out_deriv.Data(). + in_deriv->CopyFromMat(out_deriv); + + if (test_mode_ || dropout_proportion_ == 0.0) { + KALDI_ASSERT(memo == NULL); + return; + } + + const GeneralDropoutComponentPrecomputedIndexes *indexes = + dynamic_cast(indexes_in); + KALDI_ASSERT(indexes != NULL && memo != NULL); + CuMatrix *mask = reinterpret_cast*>(memo); + + if (block_dim_ < dim_) { + KALDI_ASSERT(in_deriv->Stride() == in_deriv->NumCols()); + int32 num_rows = in_deriv->NumRows(), + dim_multiple = dim_ / block_dim_, + num_rows_reshaped = num_rows * dim_multiple; + CuSubMatrix in_deriv_reshaped(in_deriv->Data(), block_dim_, + num_rows_reshaped, + num_rows_reshaped); + in_deriv_reshaped.MulRows(*mask, indexes->indexes); + } else { + in_deriv->MulRows(*mask, indexes->indexes); + } +} + +void GeneralDropoutComponent::Read(std::istream &is, bool binary) { + ExpectOneOrTwoTokens(is, binary, "", ""); + ReadBasicType(is, binary, &dim_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &block_dim_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &time_period_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &dropout_proportion_); + if (PeekToken(is, binary) == 'T') { + ExpectToken(is, binary, ""); + test_mode_ = true; + } else { + test_mode_ = false; + } + if (PeekToken(is, binary) == 'C') { + ExpectToken(is, binary, ""); + continuous_ = true; + } else { + continuous_ = false; + } + ExpectToken(is, binary, ""); +} + + +void GeneralDropoutComponent::Write(std::ostream &os, bool binary) const { + WriteToken(os, binary, ""); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, dim_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, block_dim_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, time_period_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, dropout_proportion_); + if (test_mode_) + WriteToken(os, binary, ""); + if (continuous_) + WriteToken(os, binary, ""); + WriteToken(os, binary, ""); +} + +Component* GeneralDropoutComponent::Copy() const { + return new GeneralDropoutComponent(*this); +} + +void GeneralDropoutComponent::InitFromConfig(ConfigLine *cfl) { + dim_ = 0; + bool ok = cfl->GetValue("dim", &dim_); + KALDI_ASSERT(ok && dim_ > 0); + block_dim_ = dim_; + cfl->GetValue("block-dim", &block_dim_); + if (!(block_dim_ > 0 && dim_ % block_dim_ == 0)) + KALDI_ERR << "Invalid 
configuration dim=" << dim_ + << ", block-dim=" << block_dim_; + time_period_ = 0; + cfl->GetValue("time-period", &time_period_); + dropout_proportion_ = 0.5; + cfl->GetValue("dropout-proportion", &dropout_proportion_); + continuous_ = false; + cfl->GetValue("continuous", &continuous_); + test_mode_ = false; + cfl->GetValue("test-mode", &test_mode_); +} + + +CuMatrix* GeneralDropoutComponent::GetMemo( + int32 num_mask_rows) const { + KALDI_ASSERT(num_mask_rows > 0 && !test_mode_ && + dropout_proportion_ > 0.0); + CuMatrix *ans = new CuMatrix(num_mask_rows, block_dim_); + BaseFloat dropout_proportion = dropout_proportion_; + + // This const_cast is only safe assuming you don't attempt + // to use multi-threaded code with the GPU. + const_cast&>(random_generator_).RandUniform(ans); + + if (!continuous_) { + ans->Add(-dropout_proportion); + // now, a proportion "dropout_proportion" will be < 0.0. After applying the + // function (x>0?1:0), a proportion "dropout_proportion" will be zero and (1 - + // dropout_proportion) will be 1.0. + ans->ApplyHeaviside(); + ans->Scale(1.0 / dropout_proportion); + } else { + ans->Scale(dropout_proportion * 4.0); + // make the expected value 1.0. + ans->Add(1.0 - (2.0 * dropout_proportion)); + } + return ans; +} + +ComponentPrecomputedIndexes* GeneralDropoutComponent::PrecomputeIndexes( + const MiscComputationInfo &misc_info, + const std::vector &input_indexes, + const std::vector &output_indexes, + bool need_backprop) const { + KALDI_ASSERT(input_indexes == output_indexes); + + GeneralDropoutComponentPrecomputedIndexes *ans = new + GeneralDropoutComponentPrecomputedIndexes; + int32 size = input_indexes.size(), time_period = time_period_, + cur_row = 0; + std::vector indexes(size); + // the map 'm' will map from a pair from (n, t) value to the row-index of the + // dropout-mask matrix*. However, the 't' isn't a real 't' value; + // if time_period_ == 0, the 't' value will just be zero; otherwise, + // it will be t divided by time_period_ (rounding towards negative infinity). + + // *before considering effects related to when block_dim_ != dim_. + + std::unordered_map, int32, PairHasher > m; + for (int32 i = 0; i < size; i++) { + int32 n = input_indexes[i].n, + t = (time_period == 0 ? 
0 : DivideRoundingDown(input_indexes[i].t, + time_period)); + std::pair p(n, t); + + std::unordered_map, int32, + PairHasher >::const_iterator + iter = m.find(p); + if (iter != m.end()) { + indexes[i] = iter->second; + } else { + m[p] = cur_row; + indexes[i] = cur_row; + cur_row++; + } + } + int32 multiple = dim_ / block_dim_; + ans->num_mask_rows = cur_row; + if (multiple == 1) { + ans->indexes.CopyFromVec(indexes); + } else { + ans->num_mask_rows = cur_row * multiple; + std::vector repeated_indexes; + repeated_indexes.reserve(size * multiple); + for (int32 i = 0; i < size; i++) { + int32 row = indexes[i]; + for (int32 j = 0; j < multiple; j++) + repeated_indexes.push_back(row); + } + ans->indexes.CopyFromVec(repeated_indexes); + } + return ans; +} + +void GeneralDropoutComponentPrecomputedIndexes::Write(std::ostream &os, + bool binary) const { + WriteToken(os, binary, + ""); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, num_mask_rows); + WriteToken(os, binary, ""); + indexes.Write(os, binary); + WriteToken(os, binary, + ""); +} + +void GeneralDropoutComponentPrecomputedIndexes::Read(std::istream &is, + bool binary) { + ExpectOneOrTwoTokens(is, binary, + "", + ""); + ReadBasicType(is, binary, &num_mask_rows); + ExpectToken(is, binary, ""); + indexes.Read(is, binary); + ExpectToken(is, binary, + ""); +} } // namespace nnet3 diff --git a/src/nnet3/nnet-general-component.h b/src/nnet3/nnet-general-component.h index 36829329d66..cff73a55b59 100644 --- a/src/nnet3/nnet-general-component.h +++ b/src/nnet3/nnet-general-component.h @@ -715,7 +715,12 @@ class DropoutMaskComponent: public RandomComponent { virtual std::string Info() const; // possible parameter values with their defaults: - // dropout-proportion=0.5 output-dim=-1 + // dropout-proportion=0.5 output-dim=-1 continuous=false + // With the 'continous=false' option (the default), it generates + // 0 with probability 'dropout-proportion' and 1 otherwise. + // With 'continuous=true' it outputs 1 plus dropout-proportion times + // a value uniformly distributed on [-2, 2]. (e.g. if dropout-proportion is + // 0.5, this would amount to a value uniformly distributed on [0,2].) virtual void InitFromConfig(ConfigLine *cfl); DropoutMaskComponent(); @@ -771,12 +776,182 @@ class DropoutMaskComponent: public RandomComponent { BaseFloat dropout_proportion_; + bool continuous_; + const DropoutMaskComponent &operator = (const DropoutMaskComponent &other); // Disallow. }; +/** + GeneralDropoutComponent implements dropout, including a continuous + variant where the thing we multiply is not just zero or one, but may + be a continuous value. It is intended for the case where you want to + either share the dropout mask across all of time, or across groups + of 't' values (e.g. the first block of 10 values gets one dropout + mask, the second block of 10 gets another one, and so on). + + + Configuration values accepted on the command line, with defaults: + + dim Dimension of the input and output of this component, + e.g. 512 + + block-dim Block size if you want the dropout mask to repeat, + e.g. if dim=512 and you sent block-dim=128, there will + be a mask of dimension 128 repeated 4 times. This can + be useful in convolutional setups. If not specified, + block-dim defaults to 'dim'; if specified, it must be + a divisor of 'dim'. + + dropout-proportion=0.5 For conventional dropout, this is the proportion + of mask values that (in expectation) are zero; it would + normally be between 0 and 0.5. 
The nonzero mask values + will be given values 1.0 / dropout_proportion, so that the + expected value is 1.0. This behavior is different from + DropoutComponent and DropoutMaskComponent. + + For continuous dropout (continuous==true), the dropout scales + will have values (1.0 + 2 * dropout-proportion * + Uniform[-1,1]). This might seem like a strange choice, but it + means that dropout-proportion=0.5 gives us a kind of + 'extremal' case where the dropout scales are distributed as + Uniform[0, 2] and we can pass in the dropout scale as if it + were a conventional dropout scale. + + time-period=0 This determines how the dropout mask interacts + with the time index (t). In all cases, different sequences + (different 'n' values) get different dropout masks. + If time-period==0, then the dropout mask is shared across + all time values. If you set time-period > 0, then the + dropout mask is shared across blocks of time values: for + instance if time-period==10, then we'll use one dropout + mask for t values 0 through 9, another for 10 through 19, + and so on. In all cases, the dropout mask will be shared + across all 'x' values, although in most setups the x values + are just zero so this isn't very interesting. + If you set time-period==1 it would be similar to regular + dropout, and it would probably make more sense to just use the + normal DropoutComponent. + + */ +class GeneralDropoutComponent: public RandomComponent { + public: + virtual int32 InputDim() const { return dim_; } + + virtual int32 OutputDim() const { return dim_; } + + virtual std::string Info() const; + + virtual void InitFromConfig(ConfigLine *cfl); + + GeneralDropoutComponent(); + + GeneralDropoutComponent(const GeneralDropoutComponent &other); + + virtual std::string Type() const { return "GeneralDropoutComponent"; } + virtual int32 Properties() const { + return kRandomComponent|kPropagateInPlace|kBackpropInPlace|kUsesMemo| + (block_dim_ != dim_ ? (kInputContiguous|kOutputContiguous) : 0); + } + + virtual void* Propagate(const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const; + virtual void Backprop(const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &, // in_value + const CuMatrixBase &, // out_value + const CuMatrixBase &out_deriv, + void *memo, + Component *to_update, + CuMatrixBase *in_deriv) const; + + virtual void DeleteMemo(void *memo) const { + delete static_cast*>(memo); + } + + virtual ComponentPrecomputedIndexes* PrecomputeIndexes( + const MiscComputationInfo &misc_info, + const std::vector &input_indexes, + const std::vector &output_indexes, + bool need_backprop) const; + + virtual void Read(std::istream &is, bool binary); + virtual void Write(std::ostream &os, bool binary) const; + + virtual Component* Copy() const; + + void SetDropoutProportion(BaseFloat p) { dropout_proportion_ = p; } + + private: + + // Returns a random matrix of dimension 'num_mask_rows' by 'block_dim_'. This + // should not be called if test_mode_ is true or dropout_proportion_ is zero. + CuMatrix *GetMemo(int32 num_mask_rows) const; + + + // The input and output dimension + int32 dim_; + + // block_dim_ must divide dim_. + int32 block_dim_; + + // time_period_ can be zero if we want all 't' values to share the same + // dropout mask, and a value more than zero if we want blocks of 't' values to + // share the dropout mask. For example, if time_period_ is 10, blocks of size + // 10 frames will share the same dropout mask. 
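[Editor's aside] To make the time-period sharing rule concrete: each (n, t) index is keyed on (n, t divided by time-period, rounding towards negative infinity), and all indexes with the same key reuse one row of the dropout mask. The standalone sketch below (plain C++; the names FloorDiv, row_of_group etc. are illustrative, and this is not the PrecomputeIndexes() code from this patch) shows that two sequences of 25 frames with time-period=10 need only six mask rows.

// Sketch only: how (n, t) indexes collapse onto shared dropout-mask rows
// when time-period > 0.  FloorDiv is a stand-in for DivideRoundingDown.
#include <iostream>
#include <map>
#include <utility>
#include <vector>

// Integer division rounding towards negative infinity, so t = -1 with
// time_period = 10 falls in block -1, not block 0.
static int FloorDiv(int a, int b) {
  int q = a / b, r = a % b;
  return (r != 0 && ((r < 0) != (b < 0))) ? q - 1 : q;
}

int main() {
  const int time_period = 10;
  // Two sequences (n = 0, 1), frames t = 0..24 each.
  std::vector<std::pair<int, int> > nt;
  for (int n = 0; n < 2; n++)
    for (int t = 0; t < 25; t++)
      nt.push_back(std::make_pair(n, t));

  std::map<std::pair<int, int>, int> row_of_group;  // (n, t-block) -> mask row
  std::vector<int> mask_row(nt.size());
  int next_row = 0;
  for (size_t i = 0; i < nt.size(); i++) {
    int block = (time_period == 0 ? 0 : FloorDiv(nt[i].second, time_period));
    std::pair<int, int> key(nt[i].first, block);
    std::map<std::pair<int, int>, int>::iterator it = row_of_group.find(key);
    if (it == row_of_group.end()) {
      row_of_group[key] = next_row;
      mask_row[i] = next_row++;
    } else {
      mask_row[i] = it->second;
    }
  }
  // 2 sequences * 3 blocks (t = 0..9, 10..19, 20..24) -> 6 mask rows.
  std::cout << "num mask rows: " << next_row << "\n";
  return 0;
}

[End of aside.]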
+ int32 time_period_; + + BaseFloat dropout_proportion_; + + bool continuous_; + + bool test_mode_; + + const GeneralDropoutComponent &operator + = (const GeneralDropoutComponent &other); // Disallow. +}; + +// This stores some precomputed indexes for GeneralDropoutComponent. +// This object is created for every instance of the Propagate() +// function in the compiled computation. +class GeneralDropoutComponentPrecomputedIndexes: + public ComponentPrecomputedIndexes { + public: + + + // num_mask_rows is the number of rows in the dropout-mask matrix; + // it's num-cols is the block_dim_ of the component. + int32 num_mask_rows; + + // 'indexes' is of dimension (the number of rows in the matrix we're doing + // Propagate() or Backprop() on) times the (dim_ / block_dim_) of the + // GeneralDropoutComponent. Each value is in the range [0, num_mask_rows-1], + // and each value is repeated (dim_ / block_dim_) times. This array is used + // to multiply the reshaped values or derivatives by the appropriate rows of + // the dropout matrix. + CuArray indexes; + + virtual ~GeneralDropoutComponentPrecomputedIndexes() { } + + ComponentPrecomputedIndexes *Copy() const { + return new GeneralDropoutComponentPrecomputedIndexes(*this); + } + + virtual void Write(std::ostream &os, bool binary) const; + + virtual void Read(std::istream &is, bool binary); + + virtual std::string Type() const { + return "GeneralDropoutComponentPrecomputedIndexes"; + } +}; + + + + } // namespace nnet3 diff --git a/src/nnet3/nnet-normalize-component.h b/src/nnet3/nnet-normalize-component.h index 1806fe38493..37ad624d0f0 100644 --- a/src/nnet3/nnet-normalize-component.h +++ b/src/nnet3/nnet-normalize-component.h @@ -58,7 +58,7 @@ namespace nnet3 { Configuration values accepted: dim, or input-dim Input dimension of this component, e.g. 1024. Will be the same as the output dimension if add-log-stddev=false. - block-dim Defaults to 'dim' you may specify a nonzero divisor + block-dim Defaults to 'dim' you may specify a divisor of 'dim'. In this case the input dimension will be interpreted as blocks of dimension 'block-dim' to which the nonlinearity described above is applied @@ -144,11 +144,11 @@ class NormalizeComponent: public Component { Accepted configuration values: dim Dimension of the input and output - block-dim Defaults to 'dim', but may be set to a nonzero divisor + block-dim Defaults to 'dim', but may be set to a divisor of 'dim'. In this case, each block of dimension 'block-dim' is treated like a separate row of the input matrix, which means that the stats from n'th element of each - block are pooled into one class, for each n.a + block are pooled into one class, for each n. epsilon Small term added to the variance that is used to prevent division by zero target-rms This defaults to 1.0, but if set, for instance, to 2.0, diff --git a/src/nnet3/nnet-simple-component.h b/src/nnet3/nnet-simple-component.h index b1eb30a55bf..2d776180533 100644 --- a/src/nnet3/nnet-simple-component.h +++ b/src/nnet3/nnet-simple-component.h @@ -2176,7 +2176,7 @@ class LstmNonlinearityComponent: public UpdatableComponent { // it contains the 3 diagonal parameter matrices w_i, w_f and w_o. CuMatrix params_; - // If true, we expect an extra 2 dimensions on the input, for dropout masks + // If true, we expect an extra 3 dimensions on the input, for dropout masks // for i_t and f_t. 
bool use_dropout_; diff --git a/src/nnet3/nnet-training.cc b/src/nnet3/nnet-training.cc index 6bff30c501b..812b66c41b1 100644 --- a/src/nnet3/nnet-training.cc +++ b/src/nnet3/nnet-training.cc @@ -90,7 +90,7 @@ void NnetTrainer::TrainInternal(const NnetExample &eg, const NnetComputation &computation) { // note: because we give the 1st arg (nnet_) as a pointer to the // constructor of 'computer', it will use that copy of the nnet to - // store stats. This is mainly important for memory-norm. + // store stats. NnetComputer computer(config_.compute_config, computation, nnet_, delta_nnet_); // give the inputs to the computer object. @@ -131,7 +131,7 @@ void NnetTrainer::TrainInternalBackstitch(const NnetExample &eg, bool is_backstitch_step1) { // note: because we give the 1st arg (nnet_) as a pointer to the // constructor of 'computer', it will use that copy of the nnet to - // store stats. This is mainly important for memory-norm. + // store stats. NnetComputer computer(config_.compute_config, computation, nnet_, delta_nnet_); // give the inputs to the computer object. diff --git a/src/nnet3/nnet-utils.cc b/src/nnet3/nnet-utils.cc index fd2229cace8..afe624f94ca 100644 --- a/src/nnet3/nnet-utils.cc +++ b/src/nnet3/nnet-utils.cc @@ -486,6 +486,10 @@ void SetDropoutProportion(BaseFloat dropout_proportion, dynamic_cast(nnet->GetComponent(c)); if (mc != NULL) mc->SetDropoutProportion(dropout_proportion); + GeneralDropoutComponent *gdc = + dynamic_cast(nnet->GetComponent(c)); + if (gdc != NULL) + gdc->SetDropoutProportion(dropout_proportion); } } @@ -1172,12 +1176,17 @@ void ReadEditConfig(std::istream &edit_config_is, Nnet *nnet) { dynamic_cast(nnet->GetComponent(c)); DropoutMaskComponent *mask_component = dynamic_cast(nnet->GetComponent(c)); + GeneralDropoutComponent *general_dropout_component = + dynamic_cast(nnet->GetComponent(c)); if (dropout_component != NULL) { dropout_component->SetDropoutProportion(proportion); num_dropout_proportions_set++; } else if (mask_component != NULL){ mask_component->SetDropoutProportion(proportion); num_dropout_proportions_set++; + } else if (general_dropout_component != NULL){ + general_dropout_component->SetDropoutProportion(proportion); + num_dropout_proportions_set++; } } } @@ -1461,9 +1470,10 @@ class ModelCollapser { /** Tries to produce a component that's equivalent to running the component 'component_index2' with input given by 'component_index1'. This handles - the case where 'component_index1' is of type DropoutComponent, and where - 'component_index2' is of type AffineComponent, - NaturalGradientAffineComponent or TimeHeightConvolutionComponent. + the case where 'component_index1' is of type DropoutComponent or + GeneralDropoutComponent, and where 'component_index2' is of type + AffineComponent, NaturalGradientAffineComponent or + TimeHeightConvolutionComponent. Returns -1 if this code can't produce a combined component (normally because the components have the wrong types). 
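[Editor's aside] The hunk that follows distinguishes the two dropout types by the expectation of their multiplicative mask: GeneralDropoutComponent builds its mask to have expectation 1 (its kept values are scaled up, or in continuous mode the mask is centred on 1), so removing the component needs no compensating scale, whereas a conventional 0/1 mask has expectation 1 - p and does need one. A quick Monte-Carlo check of the three mask means, as a standalone sketch in plain C++ (not Kaldi code):

// Sketch only: mask expectations behind the collapse logic below.
#include <iostream>
#include <random>

int main() {
  const double p = 0.3;          // dropout proportion
  const int n = 1000000;
  std::mt19937 rng(0);
  std::uniform_real_distribution<double> uniform(0.0, 1.0);

  double sum_plain = 0.0, sum_inverted = 0.0, sum_continuous = 0.0;
  for (int i = 0; i < n; i++) {
    double u = uniform(rng);
    // Plain 0/1 mask: zero with probability p.  Mean is (1 - p).
    double plain = (u < p) ? 0.0 : 1.0;
    // Inverted mask: surviving values scaled by 1/(1 - p).  Mean is 1.
    double inverted = plain / (1.0 - p);
    // 'Continuous' mask: 1 + p * Uniform[-2, 2], i.e. a fresh U[0,1] draw
    // scaled by 4p plus (1 - 2p).  Mean is 1.
    double continuous = 1.0 - 2.0 * p + 4.0 * p * uniform(rng);
    sum_plain += plain;
    sum_inverted += inverted;
    sum_continuous += continuous;
  }
  std::cout << "mean(plain 0/1 mask)   ~ " << sum_plain / n       // ~ 0.7
            << "\nmean(inverted mask)   ~ " << sum_inverted / n   // ~ 1.0
            << "\nmean(continuous mask) ~ " << sum_continuous / n // ~ 1.0
            << std::endl;
  return 0;
}

[End of aside.]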
@@ -1473,10 +1483,23 @@ class ModelCollapser { const DropoutComponent *dropout_component = dynamic_cast( nnet_->GetComponent(component_index1)); - if (dropout_component == NULL) + const GeneralDropoutComponent *general_dropout_component = + dynamic_cast( + nnet_->GetComponent(component_index1)); + + if (dropout_component == NULL && general_dropout_component == NULL) return -1; - BaseFloat dropout_proportion = dropout_component->DropoutProportion(); - BaseFloat scale = 1.0 / (1.0 - dropout_proportion); + BaseFloat scale; // the scale we have to apply to correct for removing + // this dropout comonent. + if (dropout_component != NULL) { + BaseFloat dropout_proportion = dropout_component->DropoutProportion(); + scale = 1.0 / (1.0 - dropout_proportion); + } else { + // for GeneralDropoutComponent, it's done in such a way that the expectation + // is always 1. (When it's nonzero, we give it a value 1/(1-dropout_proportion). + // So no scaling is needed. + scale = 1.0; + } // note: if the 2nd component is not of a type that we can scale, the // following function call will return -1, which is OK. return GetScaledComponentIndex(component_index2, diff --git a/src/nnet3/nnet-utils.h b/src/nnet3/nnet-utils.h index efa36e1f64c..4b105e30beb 100644 --- a/src/nnet3/nnet-utils.h +++ b/src/nnet3/nnet-utils.h @@ -189,7 +189,7 @@ void RecomputeStats(const std::vector &egs, Nnet *nnet); /// This function affects components of child-classes of -/// RandomComponent( currently only DropoutComponent and DropoutMaskComponent). +/// RandomComponent. /// It sets "test mode" on such components (if you call it with test_mode = /// true, otherwise it would set normal mode, but this wouldn't be needed often). /// "test mode" means that having a mask containing (1-dropout_prob) in all @@ -296,7 +296,8 @@ void CollapseModel(const CollapseModelConfig &config, 'remove-orphans'. set-dropout-proportion [name=] proportion= - Sets the dropout rates for any components of type DropoutComponent whose + Sets the dropout rates for any components of type DropoutComponent, + DropoutMaskComponent or GeneralDropoutComponent whose names match the given (e.g. lstm*). defaults to "*". apply-svd name= bottleneck-dim=
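[Editor's aside] One more note on the Read() methods added earlier in this patch: they peek at the next token and only consume the optional test-mode/continuous markers when present, so models written before this change still deserialize with sensible defaults. The sketch below illustrates that idiom with plain C++ iostreams and a made-up ToyDropout struct; it is not Kaldi's I/O layer or its actual token names beyond the ones visible in the diff.

// Sketch only: backward-compatible reading of optional trailing markers.
#include <iostream>
#include <sstream>
#include <string>

struct ToyDropout {
  double dropout_proportion = 0.5;
  bool test_mode = false;
  bool continuous = false;

  void Write(std::ostream &os) const {
    os << "<DropoutProportion> " << dropout_proportion << " ";
    if (test_mode) os << "<TestMode> ";
    if (continuous) os << "<Continuous> ";
    os << "</ToyDropout> ";
  }

  void Read(std::istream &is) {
    std::string tok;
    is >> tok >> dropout_proportion;   // mandatory field
    test_mode = false;                 // defaults used by older models
    continuous = false;
    // Consume optional markers only if they are actually present.
    while (is >> tok && tok != "</ToyDropout>") {
      if (tok == "<TestMode>") test_mode = true;
      else if (tok == "<Continuous>") continuous = true;
    }
  }
};

int main() {
  // An "old" model written without the optional markers still reads fine.
  std::istringstream old_model("<DropoutProportion> 0.2 </ToyDropout> ");
  ToyDropout c;
  c.Read(old_model);
  std::cout << c.dropout_proportion << " " << c.test_mode << " "
            << c.continuous << "\n";   // prints: 0.2 0 0

  // A "new" model round-trips the extra flags.
  ToyDropout d;
  d.continuous = true;
  std::ostringstream os;
  d.Write(os);
  std::istringstream new_model(os.str());
  ToyDropout e;
  e.Read(new_model);
  std::cout << e.continuous << "\n";   // prints: 1
  return 0;
}

[End of aside.]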