diff --git a/egs/mini_librispeech/s5/local/chain/compare_wer.sh b/egs/mini_librispeech/s5/local/chain/compare_wer.sh index cd6be14ed88..8ee5db2326a 100755 --- a/egs/mini_librispeech/s5/local/chain/compare_wer.sh +++ b/egs/mini_librispeech/s5/local/chain/compare_wer.sh @@ -129,3 +129,9 @@ for x in $*; do printf "% 10s" $prob done echo + +echo -n "# Num-params " +for x in $*; do + printf "% 10s" $(grep num-parameters $x/log/progress.1.log | awk '{print $2}') +done +echo diff --git a/egs/mini_librispeech/s5/local/chain/run_tdnn.sh b/egs/mini_librispeech/s5/local/chain/run_tdnn.sh index 75da1a0a553..cb5756188a4 120000 --- a/egs/mini_librispeech/s5/local/chain/run_tdnn.sh +++ b/egs/mini_librispeech/s5/local/chain/run_tdnn.sh @@ -1 +1 @@ -tuning/run_tdnn_1e.sh \ No newline at end of file +tuning/run_tdnn_1f.sh \ No newline at end of file diff --git a/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1f.sh b/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1f.sh new file mode 100755 index 00000000000..9cc6d93022a --- /dev/null +++ b/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1f.sh @@ -0,0 +1,311 @@ +#!/bin/bash + + +# 1f is as 1e but a smaller model with various tuning changes, the most +# important of which is the 'bottleneck-dim' option for the last layer; +# also dimensions are reduced and we've removed the 'target-rms=0.5' options +# on the prefinal layers. +# +# local/chain/compare_wer.sh --online exp/chain/tdnn1{e,f}_sp 2>/dev/null +# local/chain/compare_wer.sh --online exp/chain/tdnn1e_sp exp/chain/tdnn1f_sp +# System tdnn1e_sp tdnn1f_sp +#WER dev_clean_2 (tgsmall) 14.11 13.91 +# [online:] 14.07 13.96 +#WER dev_clean_2 (tglarge) 10.15 9.95 +# [online:] 10.16 10.13 +# Final train prob -0.0503 -0.0508 +# Final valid prob -0.0887 -0.0917 +# Final train prob (xent) -1.4257 -1.3509 +# Final valid prob (xent) -1.6799 -1.5883 +# Num-params 7508490 4205322 + + +# steps/info/chain_dir_info.pl exp/chain/tdnn1{e,f}_sp +# exp/chain/tdnn1e_sp: num-iters=17 nj=2..5 num-params=7.5M dim=40+100->2309 combine=-0.057->-0.057 (over 1) xent:train/valid[10,16,final]=(-1.73,-1.46,-1.43/-1.94,-1.72,-1.68) logprob:train/valid[10,16,final]=(-0.067,-0.055,-0.050/-0.105,-0.095,-0.089) +# exp/chain/tdnn1f_sp: num-iters=17 nj=2..5 num-params=4.2M dim=40+100->2309 combine=-0.060->-0.060 (over 2) xent:train/valid[10,16,final]=(-1.60,-1.39,-1.35/-1.81,-1.64,-1.59) logprob:train/valid[10,16,final]=(-0.068,-0.056,-0.051/-0.104,-0.097,-0.092) + + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +decode_nj=10 +train_set=train_clean_5 +test_sets=dev_clean_2 +gmm=tri3b +nnet3_affix= + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +affix=1f # affix for the TDNN directory name +tree_affix= +train_stage=-10 +get_egs_stage=-10 +decode_iter= + +# training options +# training chunk-options +chunk_width=140,100,160 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 +common_egs_dir= +xent_regularize=0.1 + +# training options +srand=0 +remove_egs=true +reporting_email= + +#decode options +test_online_decoding=true # if true, it will run the last decoding stage. + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 13 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + opts="l2-regularize=0.05" + output_opts="l2-regularize=0.02 bottleneck-dim=192" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 $opts dim=384 + relu-batchnorm-layer name=tdnn2 $opts dim=384 input=Append(-1,0,1) + relu-batchnorm-layer name=tdnn3 $opts dim=384 + relu-batchnorm-layer name=tdnn4 $opts dim=384 input=Append(-1,0,1) + relu-batchnorm-layer name=tdnn5 $opts dim=384 + relu-batchnorm-layer name=tdnn6 $opts dim=384 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn7 $opts dim=384 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn8 $opts dim=512 input=Append(-6,-3,0) + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain $opts dim=384 + output-layer name=output include-log-softmax=false $output_opts dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn8 $opts dim=384 + output-layer name=output-xent $output_opts dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=10 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=256,128,64 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgsmall \ + $tree_dir $tree_dir/graph_tgsmall || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l /dev/null || true - for decode_set in train_dev eval2000; do + for decode_set in train_dev eval2000 $maybe_rt03; do ( steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ --nj $decode_nj --cmd "$decode_cmd" $iter_opts \ @@ -243,7 +244,7 @@ if $test_online_decoding && [ $stage -le 16 ]; then $lang exp/nnet3/extractor $dir ${dir}_online rm $dir/.error 2>/dev/null || true - for decode_set in train_dev eval2000; do + for decode_set in train_dev eval2000 $maybe_rt03; do ( # note: we just give it "$decode_set" as it only uses the wav.scp, the # feature type does not matter. diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7n.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7n.sh new file mode 100755 index 00000000000..cf4855db611 --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7n.sh @@ -0,0 +1,274 @@ +#!/bin/bash + + +# 7n is a kind of factorized TDNN, with skip connections. We have to write +# a proper description for this. Note: I'm not happy with how + +# The following compares this with our old tdnn_lstm system before kaldi 5.4 +# (from run_tdnn_lstm_1m.sh), and with our old TDNN system. It's over 1.5% +# absolute better than our old TDNN system, and even a bit better than our old +# TDNN+LSTM with dropout. 
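+#
+# Concretely, in the config below each spliced hidden layer is factored into
+# a 256-dim linear bottleneck (the linear-component layers, kept close to
+# semi-orthogonal by orthonormal-constraint=1.0) followed by a wide 1280-dim
+# relu-batchnorm layer; the skip connections are the extra tdnn*l terms in
+# the Append(...) expressions.  As a back-of-the-envelope check (our
+# arithmetic, ignoring biases and batchnorm), one unfactored spliced layer
+# would cost 2560*1280 = 3276800 weights, while the factored pair costs
+# 2560*256 + 512*1280 = 1310720, e.g.:
+#   echo $((2560*256 + 512*1280))   # -> 1310720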
+# +# local/chain/compare_wer_general.sh --rt03 tdnn_lstm_1m_ld5_sp tdnn_7m_sp tdnn7n_sp +# System tdnn_lstm_1m_ld5_sp tdnn_7m_sp tdnn7n_sp +# WER on train_dev(tg) 12.33 13.70 12.18 +# WER on train_dev(fg) 11.42 12.67 11.12 +# WER on eval2000(tg) 15.2 16.6 14.9 +# WER on eval2000(fg) 13.8 15.1 13.5 +# WER on rt03(tg) 18.6 20.9 18.4 +# WER on rt03(fg) 16.3 18.3 16.2 +# Final train prob -0.082 -0.085 -0.077 +# Final valid prob -0.099 -0.103 -0.093 +# Final train prob (xent) -0.959 -1.230 -0.994 +# Final valid prob (xent) -1.0305 -1.2704 -1.0194 +# Num-parameters 39558436 16292693 20111396 + + + +# steps/info/chain_dir_info.pl exp/chain/tdnn7m23t_sp +# exp/chain/tdnn7m23t_sp: num-iters=394 nj=3..16 num-params=20.1M dim=40+100->6034 combine=-0.083->-0.081 (over 20) xent:train/valid[261,393,final]=(-1.05,-0.991,-0.994/-1.09,-1.02,-1.02) logprob:train/valid[261,393,final]=(-0.085,-0.077,-0.077/-0.100,-0.095,-0.093) + +set -e + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +affix=7n +suffix= +$speed_perturb && suffix=_sp +if [ -e data/rt03 ]; then maybe_rt03=rt03; else maybe_rt03= ; fi + +dir=exp/chain/tdnn${affix}${suffix} +decode_iter= +decode_nj=50 + +# training options +frames_per_eg=150,110,100 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 + +test_online_decoding=false # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. This is the critically different + # step compared with other recipes. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + opts="l2-regularize=0.002" + linear_opts="orthonormal-constraint=1.0" + output_opts="l2-regularize=0.0005 bottleneck-dim=256" + + mkdir -p $dir/configs + + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 $opts dim=1280 + linear-component name=tdnn2l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-layer name=tdnn2 $opts input=Append(0,1) dim=1280 + linear-component name=tdnn3l dim=256 $linear_opts + relu-batchnorm-layer name=tdnn3 $opts dim=1280 + linear-component name=tdnn4l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-layer name=tdnn4 $opts input=Append(0,1) dim=1280 + linear-component name=tdnn5l dim=256 $linear_opts + relu-batchnorm-layer name=tdnn5 $opts dim=1280 input=Append(tdnn5l, tdnn3l) + linear-component name=tdnn6l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn6 $opts input=Append(0,3) dim=1280 + linear-component name=tdnn7l dim=256 $linear_opts 
input=Append(-3,0) + relu-batchnorm-layer name=tdnn7 $opts input=Append(0,3,tdnn6l,tdnn4l,tdnn2l) dim=1280 + linear-component name=tdnn8l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn8 $opts input=Append(0,3) dim=1280 + linear-component name=tdnn9l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn9 $opts input=Append(0,3,tdnn8l,tdnn6l,tdnn4l) dim=1280 + linear-component name=tdnn10l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn10 $opts input=Append(0,3) dim=1280 + linear-component name=tdnn11l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn11 $opts input=Append(0,3,tdnn10l,tdnn8l,tdnn6l) dim=1280 + linear-component name=prefinal-l dim=256 $linear_opts + + relu-batchnorm-layer name=prefinal-chain input=prefinal-l $opts dim=1280 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + relu-batchnorm-layer name=prefinal-xent input=prefinal-l $opts dim=1280 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$train_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 6 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + + +graph_dir=$dir/graph_sw1_tg +iter_opts= +if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " +fi +if [ $stage -le 15 ]; then + rm $dir/.error 2>/dev/null || true + for decode_set in train_dev eval2000 $maybe_rt03; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $decode_nj --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + +if $test_online_decoding && [ $stage -le 16 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. + steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3/extractor $dir ${dir}_online + + rm $dir/.error 2>/dev/null || true + for decode_set in train_dev eval2000 $maybe_rt03; do + ( + # note: we just give it "$decode_set" as it only uses the wav.scp, the + # feature type does not matter. + + steps/online/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + $graph_dir data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +exit 0; diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1m.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1m.sh index 1d566290163..b50692616c4 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1m.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1m.sh @@ -6,11 +6,14 @@ # After comparing different combinations of dropout(with or without) and decay-time # option(20, 40 or without), we found this setup is best. -#System tdnn_lstm_1l_ld5 tdnn_lstm_1m_ld 1m_online +#System tdnn_lstm_1l_ld5 tdnn_lstm_1m_ld 1m_online #WER on train_dev(tg) 12.41 12.37 12.21 #WER on train_dev(fg) 11.59 11.46 11.41 #WER on eval2000(tg) 14.8 14.8 14.9 #WER on eval2000(fg) 13.5 13.5 13.6 +# WER on rt03(tg) 18.6 +# WER on rt03(fg) 16.3 + #Final train prob -0.069 -0.081 #Final valid prob -0.095 -0.100 #Final train prob (xent) -0.913 -0.950 @@ -30,6 +33,7 @@ dir=exp/chain/tdnn_lstm_1m # Note: _sp will get added to this if $speed_perturb decode_iter= decode_dir_affix= decode_nj=50 +if [ -e data/rt03 ]; then maybe_rt03=rt03; else maybe_rt03= ; fi # training options leftmost_questions_truncate=-1 @@ -227,7 +231,7 @@ if [ $stage -le 15 ]; then if [ ! 
-z $decode_iter ]; then iter_opts=" --iter $decode_iter " fi - for decode_set in train_dev eval2000; do + for decode_set in train_dev eval2000 $maybe_rt03; do ( steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ --nj 50 --cmd "$decode_cmd" $iter_opts \ @@ -257,7 +261,7 @@ if $test_online_decoding && [ $stage -le 16 ]; then $lang exp/nnet3/extractor $dir ${dir}_online rm $dir/.error 2>/dev/null || true - for decode_set in train_dev eval2000; do + for decode_set in train_dev eval2000 $maybe_rt03; do ( # note: we just give it "$decode_set" as it only uses the wav.scp, the # feature type does not matter. diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1n.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1n.sh new file mode 100755 index 00000000000..9cb182b2915 --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1n.sh @@ -0,0 +1,284 @@ +#!/bin/bash + + +# 1n is as 1m but with significant changes, replacing TDNN layers with a +# structure like run_tdnn_7n.sh. Seems better! But the improvement +# versus the best TDNN system (see run_tdnn_7n.sh) is so small that it's +# not really worth it when you consider how much slower it is. + +# local/chain/compare_wer_general.sh --rt03 tdnn_lstm_1m_ld5_sp tdnn_lstm_1m_ld5_sp_online tdnn_lstm1n_sp tdnn_lstm1n_sp_online +# System tdnn_lstm_1m_ld5_sp tdnn_lstm_1m_ld5_sp_online tdnn_lstm1n_sp tdnn_lstm1n_sp_online +# WER on train_dev(tg) 12.33 12.21 12.38 12.49 +# WER on train_dev(fg) 11.42 11.41 11.48 11.59 +# WER on eval2000(tg) 15.2 15.1 15.0 14.9 +# WER on eval2000(fg) 13.8 13.8 13.5 13.5 +# WER on rt03(tg) 18.6 18.4 18.0 18.0 +# WER on rt03(fg) 16.3 16.1 15.8 15.8 +# Final train prob -0.082 0.000 -0.084 0.000 +# Final valid prob -0.099 0.000 -0.104 0.000 +# Final train prob (xent) -0.959 0.000 -1.154 0.000 +# Final valid prob (xent) -1.0305 0.0000 -1.2190 0.0000 +# Num-parameters 39558436 0 27773348 0 +# + + +# exp/chain/tdnn_lstm1n_sp: num-iters=394 nj=3..16 num-params=27.8M dim=40+100->6034 combine=-0.081->-0.080 (over 5) xent:train/valid[261,393,final]=(-1.59,-1.14,-1.15/-1.64,-1.22,-1.22) logprob:train/valid[261,393,final]=(-0.105,-0.086,-0.084/-0.123,-0.107,-0.104) + +set -e + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +affix=1n +decode_iter= +decode_dir_affix= +decode_nj=50 +if [ -e data/rt03 ]; then maybe_rt03=rt03; else maybe_rt03= ; fi + +# training options +frames_per_chunk=140,100,160 +frames_per_chunk_primary=$(echo $frames_per_chunk | cut -d, -f1) +chunk_left_context=40 +chunk_right_context=0 +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 +# decode options +extra_left_context=50 +extra_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' + +remove_egs=true +common_egs_dir= + +test_online_decoding=true # if true, it will run the last decoding stage. +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
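+  # (As a sanity check after this stage, the actual number of leaves can be
+  # read back with e.g. `tree-info $treedir/tree | grep num-pdfs`; it comes
+  # out somewhat below the 7000 requested below -- 6034 in the run shown in
+  # the header above.)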
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + opts="l2-regularize=0.002" + linear_opts="orthonormal-constraint=1.0" + lstm_opts="l2-regularize=0.0005 decay-time=40" + output_opts="l2-regularize=0.0005 output-delay=$label_delay max-change=1.5 dim=$num_targets" + + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 $opts dim=1280 + linear-component name=tdnn2l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-layer name=tdnn2 $opts input=Append(0,1) dim=1280 + linear-component name=tdnn3l dim=256 $linear_opts + relu-batchnorm-layer name=tdnn3 $opts dim=1280 + linear-component name=tdnn4l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-layer name=tdnn4 $opts input=Append(0,1) dim=1280 + linear-component name=tdnn5l dim=256 $linear_opts + relu-batchnorm-layer name=tdnn5 $opts dim=1280 input=Append(tdnn5l, tdnn3l) + linear-component name=tdnn6l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn6 $opts input=Append(0,3) dim=1280 + linear-component name=lstm1l dim=256 $linear_opts input=Append(-3,0) + fast-lstmp-layer name=lstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=128 delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn7 $opts input=Append(0,3,tdnn6l,tdnn4l,tdnn2l) dim=1280 + linear-component name=tdnn8l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn8 $opts input=Append(0,3) dim=1280 + linear-component name=lstm2l dim=256 $linear_opts input=Append(-3,0) + fast-lstmp-layer name=lstm2 cell-dim=1280 recurrent-projection-dim=256 non-recurrent-projection-dim=128 delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn9 $opts input=Append(0,3,tdnn8l,tdnn6l,tdnn4l) dim=1280 + linear-component name=tdnn10l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn10 $opts input=Append(0,3) dim=1280 + linear-component name=lstm3l dim=256 $linear_opts input=Append(-3,0) + fast-lstmp-layer name=lstm3 cell-dim=1280 recurrent-projection-dim=256 non-recurrent-projection-dim=128 delay=-3 dropout-proportion=0.0 $lstm_opts + + output-layer name=output input=lstm3 include-log-softmax=false $output_opts + + output-layer name=output-xent input=lstm3 learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/c0{1,2,5,7}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 6 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_chunk \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000 $maybe_rt03; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; + +if $test_online_decoding && [ $stage -le 16 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. + steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3/extractor $dir ${dir}_online + + rm $dir/.error 2>/dev/null || true + for decode_set in train_dev eval2000 $maybe_rt03; do + ( + # note: we just give it "$decode_set" as it only uses the wav.scp, the + # feature type does not matter. 
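+      # (${dir}_online, created by prepare_online_decoding.sh above, bundles
+      # the model with the i-vector extractor and feature config, which is
+      # why no --online-ivector-dir option is needed here, unlike in the
+      # offline decoding stage above.)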
+ + steps/online/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" $iter_opts \ + --acwt 1.0 --post-decode-acwt 10.0 \ + $graph_dir data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in online decoding" + exit 1 + fi +fi + +exit 0; diff --git a/egs/tedlium/s5_r2/local/chain/compare_wer_general.sh b/egs/tedlium/s5_r2/local/chain/compare_wer_general.sh index 00b2d29cc88..88dde1ff0e2 100755 --- a/egs/tedlium/s5_r2/local/chain/compare_wer_general.sh +++ b/egs/tedlium/s5_r2/local/chain/compare_wer_general.sh @@ -102,5 +102,10 @@ for x in $*; do prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -w xent | awk '{printf("%.4f", $8)}') printf "% 10s" $prob done +echo +echo -n "# Num-params " +for x in $*; do + printf "% 10s" $(grep num-parameters $x/log/progress.1.log | awk '{print $2}') +done echo diff --git a/egs/wsj/s5/local/chain/run_tdnn.sh b/egs/wsj/s5/local/chain/run_tdnn.sh index 75da1a0a553..cb5756188a4 120000 --- a/egs/wsj/s5/local/chain/run_tdnn.sh +++ b/egs/wsj/s5/local/chain/run_tdnn.sh @@ -1 +1 @@ -tuning/run_tdnn_1e.sh \ No newline at end of file +tuning/run_tdnn_1f.sh \ No newline at end of file diff --git a/egs/wsj/s5/local/chain/run_tdnn_lstm.sh b/egs/wsj/s5/local/chain/run_tdnn_lstm.sh index 8e647598556..a4fa11e0908 120000 --- a/egs/wsj/s5/local/chain/run_tdnn_lstm.sh +++ b/egs/wsj/s5/local/chain/run_tdnn_lstm.sh @@ -1 +1 @@ -tuning/run_tdnn_lstm_1a.sh \ No newline at end of file +tuning/run_tdnn_lstm_1b.sh \ No newline at end of file diff --git a/egs/wsj/s5/local/chain/tuning/run_tdnn_1f.sh b/egs/wsj/s5/local/chain/tuning/run_tdnn_1f.sh new file mode 100755 index 00000000000..be8d39de80b --- /dev/null +++ b/egs/wsj/s5/local/chain/tuning/run_tdnn_1f.sh @@ -0,0 +1,342 @@ +#!/bin/bash + +# 1f is as 1e but a re-tuned model with fewer parameters and a bottleneck at the +# end, and no chain l2-regularize +#[note: was 1e12e.] + +# local/chain/compare_wer.sh exp/chain/tdnn1e10_sp exp/chain/tdnn1e12e_sp +# System tdnn1e10_sp tdnn1e12e_sp +#WER dev93 (tgpr) 7.29 7.20 +#WER dev93 (tg) 7.08 6.81 +#WER dev93 (big-dict,tgpr) 5.15 5.04 +#WER dev93 (big-dict,fg) 4.52 4.42 +#WER eval92 (tgpr) 5.12 4.80 +#WER eval92 (tg) 4.91 4.54 +#WER eval92 (big-dict,tgpr) 2.94 2.76 +#WER eval92 (big-dict,fg) 2.57 2.30 +# Final train prob -0.0545 -0.0455 +# Final valid prob -0.0650 -0.0599 +# Final train prob (xent) -0.9696 -0.9060 +# Final valid prob (xent) -0.9917 -0.9448 +# Num-params 8067660 6071244 + + +# exp/chain/tdnn1e_sp: num-iters=72 nj=2..8 num-params=8.1M dim=40+100->2854 combine=-0.064->-0.063 (over 3) xent:train/valid[47,71,final]=(-1.07,-0.973,-0.970/-1.08,-0.992,-0.992) logprob:train/valid[47,71,final]=(-0.064,-0.056,-0.054/-0.072,-0.066,-0.065) +# exp/chain/tdnn1f_sp: num-iters=72 nj=2..8 num-params=6.1M dim=40+100->2854 combine=-0.061->-0.061 (over 2) xent:train/valid[47,71,final]=(-1.04,-0.911,-0.910/-1.06,-0.953,-0.952) logprob:train/valid[47,71,final]=(-0.063,-0.052,-0.051/-0.071,-0.064,-0.064) + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). 
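+# All of the option defaults below can be overridden on the command line via
+# utils/parse_options.sh (sourced further down); e.g., a hypothetical
+# invocation to rerun only from the network-training stage:
+#   local/chain/tuning/run_tdnn_1f.sh --stage 16 --train-stage -10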
+stage=0 +nj=30 +train_set=train_si284 +test_sets="test_dev93 test_eval92" +gmm=tri4b # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. +num_threads_ubm=32 +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. + +# Options which are not passed through to run_ivector_common.sh +affix=1f #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +common_egs_dir= +reporting_email= + +# LSTM/chain options +train_stage=-10 +xent_regularize=0.1 + +# training chunk-options +chunk_width=140,100,160 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 + +# training options +srand=0 +remove_egs=true + +#decode options +test_online_decoding=false # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 13 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 14 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 15 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + opts="l2-regularize=0.01" + output_opts="l2-regularize=0.005 bottleneck-dim=320" + + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 $opts dim=448 + relu-batchnorm-layer name=tdnn2 $opts dim=448 input=Append(-1,0,1) + relu-batchnorm-layer name=tdnn3 $opts dim=448 + relu-batchnorm-layer name=tdnn4 $opts dim=448 input=Append(-1,0,1) + relu-batchnorm-layer name=tdnn5 $opts dim=448 + relu-batchnorm-layer name=tdnn6 $opts dim=448 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn7 $opts dim=448 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn8 $opts dim=448 input=Append(-6,-3,0) + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain $opts dim=448 + output-layer name=output $output_opts include-log-softmax=false dim=$num_targets + + # adding the layers for xent 
branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent $opts input=tdnn8 dim=448 + output-layer name=output-xent $output_opts dim=$num_targets learning-rate-factor=$learning_rate_factor +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/wsj-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=4 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=8 \ + --trainer.optimization.initial-effective-lrate=0.0005 \ + --trainer.optimization.final-effective-lrate=0.00005 \ + --trainer.num-chunk-per-minibatch=256,128,64 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=0 \ + --egs.chunk-right-context=0 \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 17 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. 
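+  # For example (hypothetical directory name), you could build a graph with a
+  # different LM like this:
+  #   utils/lang/check_phones_compatible.sh data/lang_test_other/phones.txt $lang/phones.txt
+  #   utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test_other $tree_dir $tree_dir/graph_other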
+ + utils/lang/check_phones_compatible.sh \ + data/lang_test_tgpr/phones.txt $lang/phones.txt + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgpr \ + $tree_dir $tree_dir/graph_tgpr || exit 1; + + utils/lang/check_phones_compatible.sh \ + data/lang_test_bd_tgpr/phones.txt $lang/phones.txt + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_bd_tgpr \ + $tree_dir $tree_dir/graph_bd_tgpr || exit 1; +fi + +if [ $stage -le 18 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + data_affix=$(echo $data | sed s/test_//) + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + data_affix=$(echo $data | sed s/test_//) + nspk=$(wc -l 6) +# 1b19 is a rerun of 1b18d3 (a fairly small LSTM+TDNN setup). +# +# +# 1b18d3 is as 1b18d2 but reducing lstm bottleneck dim from 304 to 256. +# [1b18d2 is just a rerun of 1b18d as I merged various code changes and +# I want to make sure nothing bad happened.] +# +# Results below show it's probably slightly better than the average of 18d and 18d2 +# (which are supposed to be the same experiment)... +# +# local/chain/compare_wer.sh exp/chain/tdnn_lstm1b18d_sp exp/chain/tdnn_lstm1b18d2_sp exp/chain/tdnn_lstm1b18d3_sp +# System tdnn_lstm1b18d_sp tdnn_lstm1b18d2_sp tdnn_lstm1b18d3_sp +#WER dev93 (tgpr) 7.78 7.46 7.46 +#WER dev93 (tg) 7.29 7.30 7.04 +#WER dev93 (big-dict,tgpr) 5.56 5.51 5.55 +#WER dev93 (big-dict,fg) 5.32 5.08 5.05 +#WER eval92 (tgpr) 5.33 5.40 5.39 +#WER eval92 (tg) 5.05 5.03 4.96 +#WER eval92 (big-dict,tgpr) 3.42 3.26 3.35 +#WER eval92 (big-dict,fg) 2.91 2.64 2.82 +# Final train prob -0.0529 -0.0536 -0.0543 +# Final valid prob -0.0633 -0.0630 -0.0636 +# Final train prob (xent) -0.8327 -0.8330 -0.8415 +# Final valid prob (xent) -0.8693 -0.8672 -0.8695 +# Num-params 4922060 4922060 4805324 + +# +# 1b18d is as 1b18c, but adding 'self-scale=2.0' to scale up the m_trunc when it is given +# as input to the affine projections (I found previously this was helpful). +# .. Interesting: objf improves but WER is not better. +# +# local/chain/compare_wer.sh exp/chain/tdnn_lstm1b18c_sp exp/chain/tdnn_lstm1b18d_sp +# System tdnn_lstm1b18c_sp tdnn_lstm1b18d_sp +#WER dev93 (tgpr) 7.77 7.78 +#WER dev93 (tg) 7.40 7.29 +#WER dev93 (big-dict,tgpr) 5.39 5.56 +#WER dev93 (big-dict,fg) 5.25 5.32 +#WER eval92 (tgpr) 5.48 5.33 +#WER eval92 (tg) 4.98 5.05 +#WER eval92 (big-dict,tgpr) 3.07 3.42 +#WER eval92 (big-dict,fg) 2.69 2.91 +# Final train prob -0.0546 -0.0529 +# Final valid prob -0.0641 -0.0633 +# Final train prob (xent) -0.8679 -0.8327 +# Final valid prob (xent) -0.8954 -0.8693 +# Num-params 4922060 4922060 + +# 1b18c is as 1b18b, but fixing a bug in the script whereby c instead of m had been used +# as input to the affine projections. + +# 1b18b is as 1b18, but doubling l2 regularization on the output +# and lstm layers, parts of them were training too slowly. +# +# 1b18 is as 1b17, but via script change, not using memory-norm (actually +# this is the same as 1b17d). +# I don't see any WER change, but objf is worse. 
+ +# local/chain/compare_wer.sh exp/chain/tdnn_lstm1b17_sp exp/chain/tdnn_lstm1b17d_sp exp/chain/tdnn_lstm1b18_sp +# System tdnn_lstm1b17_sp tdnn_lstm1b17d_sp tdnn_lstm1b18_sp +#WER dev93 (tgpr) 7.49 7.44 7.48 +#WER dev93 (tg) 7.18 7.13 7.19 +#WER dev93 (big-dict,tgpr) 5.50 5.34 5.48 +#WER dev93 (big-dict,fg) 5.11 5.15 5.04 +#WER eval92 (tgpr) 5.26 5.32 5.32 +#WER eval92 (tg) 5.00 4.94 5.03 +#WER eval92 (big-dict,tgpr) 3.24 3.28 3.26 +#WER eval92 (big-dict,fg) 2.82 2.80 2.84 +# Final train prob -0.0489 -0.0486 -0.0496 +# Final valid prob -0.0583 -0.0599 -0.0612 +# Final train prob (xent) -0.7550 -0.7809 -0.7749 +# Final valid prob (xent) -0.7988 -0.8121 -0.8131 +# Num-params 4922060 4922060 4922060 + +# 1b17 is as 1b13m, it's just a rerun after some code changes (adding +# diagonal natural gradient stuff) which should make no difference. +# Still seems to be working. + +# local/chain/compare_wer.sh exp/chain/tdnn_lstm1b13d_sp exp/chain/tdnn_lstm1b13m_sp exp/chain/tdnn_lstm1b17_sp +# System tdnn_lstm1b13d_sp tdnn_lstm1b13m_sp tdnn_lstm1b17_sp +#WER dev93 (tgpr) 7.86 7.43 7.49 +#WER dev93 (tg) 7.40 7.00 7.18 +#WER dev93 (big-dict,tgpr) 5.65 5.21 5.50 +#WER dev93 (big-dict,fg) 5.11 4.76 5.11 +#WER eval92 (tgpr) 5.64 5.39 5.26 +#WER eval92 (tg) 5.17 5.00 5.00 +#WER eval92 (big-dict,tgpr) 3.21 3.30 3.24 +#WER eval92 (big-dict,fg) 2.84 2.62 2.82 +# Final train prob -0.0469 -0.0516 -0.0489 +# Final valid prob -0.0601 -0.0607 -0.0583 +# Final train prob (xent) -0.7424 -0.7593 -0.7550 +# Final valid prob (xent) -0.7920 -0.7982 -0.7988 +# Num-params 5456076 4922060 4922060 + +# 1b13m is as 1b13l, but reverting the LSTM script "fix" (which actually +# made things worse), so the baseline is 1b13{c,d} (and the change versus +# c,d is to add bottleneck-dim=256). +# +# It's helpful: +# local/chain/compare_wer.sh exp/chain/tdnn_lstm1b13c_sp exp/chain/tdnn_lstm1b13d_sp exp/chain/tdnn_lstm1b13m_sp +# System tdnn_lstm1b13c_sp tdnn_lstm1b13d_sp tdnn_lstm1b13m_sp +#WER dev93 (tgpr) 7.68 7.86 7.43 +#WER dev93 (tg) 7.34 7.40 7.00 +#WER dev93 (big-dict,tgpr) 5.42 5.65 5.21 +#WER dev93 (big-dict,fg) 5.05 5.11 4.76 +#WER eval92 (tgpr) 5.48 5.64 5.39 +#WER eval92 (tg) 5.26 5.17 5.00 +#WER eval92 (big-dict,tgpr) 3.23 3.21 3.30 +#WER eval92 (big-dict,fg) 2.82 2.84 2.62 +# Final train prob -0.0490 -0.0469 -0.0516 +# Final valid prob -0.0597 -0.0601 -0.0607 +# Final train prob (xent) -0.7549 -0.7424 -0.7593 +# Final valid prob (xent) -0.7910 -0.7920 -0.7982 +# Num-params 5456076 5456076 4922060 +# +# +# 1b13l is as 1b13k, but adding bottleneck-dim=256 to the output layers. +# Definitely helpful: + +# local/chain/compare_wer.sh exp/chain/tdnn_lstm1b13k_sp exp/chain/tdnn_lstm1b13l_sp +# System tdnn_lstm1b13k_sp tdnn_lstm1b13l_sp +#WER dev93 (tgpr) 7.94 7.46 +#WER dev93 (tg) 7.68 7.09 +#WER dev93 (big-dict,tgpr) 5.91 5.39 +#WER dev93 (big-dict,fg) 5.56 4.94 +#WER eval92 (tgpr) 5.65 5.44 +#WER eval92 (tg) 5.32 5.09 +#WER eval92 (big-dict,tgpr) 3.49 3.15 +#WER eval92 (big-dict,fg) 3.07 2.94 +# Final train prob -0.0491 -0.0513 +# Final valid prob -0.0600 -0.0599 +# Final train prob (xent) -0.7395 -0.7490 +# Final valid prob (xent) -0.7762 -0.7860 +# Num-params 5456076 4922060 + +# 1b13k is as 1b13d, but after a script fix: previously we were using the 'c' +# for the full-matrix part of the recurrence instead of the 'm'. + +# 1b13d is as 1b13c, but a rerun after fixing a code bug whereby the natural gradient +# for the LinearComponent was turned off by default when initializing from config. 
+# **Update: turns out there was no difference here, the code had been ignoring +# that config variable.** +# +# It seems to optimize better, although the WER change is unclear. However, it's +# interesting that the average objf in the individual training jobs (train.*.log) is not better- +# but in compute_prob_train.*.log it is. It seems that the natural gradient interacts +# well with model averaging, which is what we found previously in the NG paper. + + +# local/chain/compare_wer.sh exp/chain/tdnn_lstm1b13c_sp exp/chain/tdnn_lstm1b13d_sp +# System tdnn_lstm1b13c_sp tdnn_lstm1b13d_sp +#WER dev93 (tgpr) 7.68 7.86 +#WER dev93 (tg) 7.34 7.40 +#WER dev93 (big-dict,tgpr) 5.42 5.65 +#WER dev93 (big-dict,fg) 5.05 5.11 +#WER eval92 (tgpr) 5.48 5.64 +#WER eval92 (tg) 5.26 5.17 +#WER eval92 (big-dict,tgpr) 3.23 3.21 +#WER eval92 (big-dict,fg) 2.82 2.84 +# Final train prob -0.0490 -0.0469 +# Final valid prob -0.0597 -0.0601 +# Final train prob (xent) -0.7549 -0.7424 +# Final valid prob (xent) -0.7910 -0.7920 +# Num-params 5456076 5456076 +# +# +# 1b13c is as 1b13b, but after script change in which the lstmb layer was +# rewritten, adding memnorm and removing the scale of 4.0, along with some +# more minor changes and streamlining/removing options. +# +# 1b13b is as 1b13, but a rerun after merging with the memnorm-and-combine +# branch. Slight difference in num-params is because of 300 vs 304. + +# 1b13 is as 1b10 but reducing the bottleneck dim to 304 +# (because I want to get in the habit of using multiples of 8). +# WER seems improved. +# +# + +# local/chain/compare_wer.sh exp/chain/tdnn_lstm1b10_sp exp/chain/tdnn_lstm1b13_sp +# System tdnn_lstm1b10_sp tdnn_lstm1b13_sp +#WER dev93 (tgpr) 7.87 7.63 +#WER dev93 (tg) 7.48 7.46 +#WER dev93 (big-dict,tgpr) 5.55 5.56 +#WER dev93 (big-dict,fg) 5.25 5.09 +#WER eval92 (tgpr) 5.44 5.48 +#WER eval92 (tg) 5.05 5.12 +#WER eval92 (big-dict,tgpr) 3.24 3.17 +#WER eval92 (big-dict,fg) 2.73 2.60 +# Final train prob -0.0463 -0.0470 +# Final valid prob -0.0561 -0.0565 +# Final train prob (xent) -0.7362 -0.7588 +# Final valid prob (xent) -0.7730 -0.7831 +# Num-params 5650636 5446348 + +# 1b10 is as 1b9 but reducing the cell and bottleneck dimension of LSTM layer from 512 to 384. +# Seems helpful on average-- nice! + +# local/chain/compare_wer.sh exp/chain/tdnn_lstm1b9_sp exp/chain/tdnn_lstm1b10_sp +# System tdnn_lstm1b9_sp tdnn_lstm1b10_sp +#WER dev93 (tgpr) 7.74 7.87 +#WER dev93 (tg) 7.46 7.48 +#WER dev93 (big-dict,tgpr) 5.67 5.55 +#WER dev93 (big-dict,fg) 5.31 5.25 +#WER eval92 (tgpr) 5.60 5.44 +#WER eval92 (tg) 5.42 5.05 +#WER eval92 (big-dict,tgpr) 3.47 3.24 +#WER eval92 (big-dict,fg) 3.07 2.73 +# Final train prob -0.0413 -0.0463 +# Final valid prob -0.0543 -0.0561 +# Final train prob (xent) -0.6786 -0.7362 +# Final valid prob (xent) -0.7249 -0.7730 +# Num-params 7021644 5650636 + +# 1b9 is as 1b8 but adding batchnorm after the LSTM layer.. this is +# to correct an oversight. +# 1b8 is as 1b7 but with quite a few layers removed. WER effect is unclear. 
+ +# local/chain/compare_wer.sh exp/chain/tdnn_lstm1b7_sp exp/chain/tdnn_lstm1b8_sp +# System tdnn_lstm1b7_sp tdnn_lstm1b8_sp +#WER dev93 (tgpr) 7.31 7.60 +#WER dev93 (tg) 7.10 7.25 +#WER dev93 (big-dict,tgpr) 5.26 5.26 +#WER dev93 (big-dict,fg) 4.64 4.93 +#WER eval92 (tgpr) 5.48 5.32 +#WER eval92 (tg) 5.00 5.07 +#WER eval92 (big-dict,tgpr) 3.35 3.31 +#WER eval92 (big-dict,fg) 2.99 2.84 +# Final train prob -0.0483 -0.0533 +# Final valid prob -0.0573 -0.0627 +# Final train prob (xent) -0.7207 -0.8234 +# Final valid prob (xent) -0.7467 -0.8466 +# Num-params 11752524 7021644 + +# 1b7 is as 1b6 but adding self-stabilize=true and normalize-type=none; +# and after a script-level change that scale 'c' by 4 before giving it +# to the W_all_a matrix (to see where all this came from, look at run_tdnn_lstm_1b16.sh +# in the mini_librispeech setup, although by the time you see this, that may no longer exist). +# +# 1b6 is as 1b3 but replacing renorm with batchnorm for the TDNN layers, +# and adding batchnorm to the LSTMB layers. Effect on WER unclear but generally +# it's better. + + +# local/chain/compare_wer.sh exp/chain/tdnn_lstm1{a2,a3,b3,b6}_sp +# local/chain/compare_wer.sh exp/chain/tdnn_lstm1a2_sp exp/chain/tdnn_lstm1a3_sp exp/chain/tdnn_lstm1b3_sp exp/chain/tdnn_lstm1b6_sp +# System tdnn_lstm1a2_sp tdnn_lstm1a3_sp tdnn_lstm1b3_sp tdnn_lstm1b6_sp +#WER dev93 (tgpr) 7.47 7.65 7.26 7.32 +#WER dev93 (tg) 7.29 7.24 6.96 6.98 +#WER dev93 (big-dict,tgpr) 5.44 5.60 5.43 5.22 +#WER dev93 (big-dict,fg) 4.98 5.04 4.97 4.86 +#WER eval92 (tgpr) 5.78 5.21 5.30 5.14 +#WER eval92 (tg) 5.44 5.00 4.87 4.82 +#WER eval92 (big-dict,tgpr) 3.35 3.23 3.42 3.24 +#WER eval92 (big-dict,fg) 2.99 2.96 3.03 2.82 +# Final train prob -0.0447 -0.0410 -0.0484 -0.0503 +# Final valid prob -0.0566 -0.0518 -0.0594 -0.0599 +# Final train prob (xent) -0.6859 -0.6676 -0.7528 -0.7415 +# Final valid prob (xent) -0.7378 -0.7230 -0.8078 -0.7804 +# Num-params 9106252 9106252 11747916 11746380 + +# 1b3 is as 1a2 but with the same change as in a->b, replacing lstmp with lstmb +# 1a2 is as 1a but adding l2 regularization. + +# this is a TDNN+LSTM chain system. +# It was modified from local/nnet3/tuning/run_tdnn_lstm_lfr_1a.sh with +# reference to ../../tedlium/s5_r2/local/chain/run_tdnn_lstm_1e.sh. +# Note: we're using the same hidden-layer sizes as +# ../../tedlium/s5_r2/local/chain/run_tdnn_lstm_1e.sh despite the +# fact that we'd normally choose a smaller model for a setup with +# less data, because the Tedlium model was probably on the small side. +# Note: we normally use more parameters for LSTM-containing than TDNN-only +# systems. + +# steps/info/chain_dir_info.pl exp/chain/tdnn_lstm1a_sp +# exp/chain/tdnn_lstm1a_sp: num-iters=120 nj=2..10 num-params=9.1M dim=40+100->2889 combine=-0.047->-0.045 xent:train/valid[79,119,final]=(-0.684,-0.569,-0.564/-0.742,-0.668,-0.665) logprob:train/valid[79,119,final]=(-0.045,-0.035,-0.034/-0.058,-0.051,-0.051) + +# The following compares: +# (nnet3 TDNN+LSTM, chain TDNN, this experiment == chain TDNN+LSTM) +# system. +# This is consistently better than the nnet3 TDNN+LSTM, but the +# difference with the chain TDNN is inconsistent. 
+ +# local/chain/compare_wer.sh --online exp/nnet3/tdnn_lstm1a_sp exp/chain/tdnn1a_sp exp/chain/tdnn_lstm1a_sp +# System tdnn_lstm1a_sp tdnn1a_sp tdnn_lstm1a_sp +#WER dev93 (tgpr) 8.54 7.87 7.48 +# [online:] 8.57 8.02 7.49 +#WER dev93 (tg) 8.25 7.61 7.41 +# [online:] 8.34 7.70 7.40 +#WER dev93 (big-dict,tgpr) 6.24 5.71 5.64 +# [online:] 6.40 5.60 5.70 +#WER dev93 (big-dict,fg) 5.70 5.10 5.40 +# [online:] 5.77 5.21 5.19 +#WER eval92 (tgpr) 6.52 5.23 5.67 +# [online:] 6.56 5.44 5.60 +#WER eval92 (tg) 6.13 4.87 5.46 +# [online:] 6.24 4.87 5.53 +#WER eval92 (big-dict,tgpr) 3.88 3.24 3.69 +# [online:] 3.88 3.31 3.63 +#WER eval92 (big-dict,fg) 3.38 2.71 3.28 +# [online:] 3.53 2.92 3.31 +# Final train prob -0.0414 -0.0341 +# Final valid prob -0.0634 -0.0506 +# Final train prob (xent) -0.8216 -0.5643 +# Final valid prob (xent) -0.9208 -0.6648 + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +train_set=train_si284 +test_sets="test_dev93 test_eval92" +gmm=tri4b # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. +num_threads_ubm=32 +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. + +# Options which are not passed through to run_ivector_common.sh +affix=1b #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +common_egs_dir= +reporting_email= + +# LSTM/chain options +train_stage=-10 +label_delay=8 +xent_regularize=0.1 + +# training chunk-options +chunk_width=140,100,160 +chunk_left_context=40 +chunk_right_context=0 + +# training options +srand=0 +remove_egs=true + +#decode options +test_online_decoding=false # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 13 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 14 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 15 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + tdnn_opts="l2-regularize=0.01" + output_opts="l2-regularize=0.005 bottleneck-dim=256" + lstm_opts="l2-regularize=0.005 self-scale=2.0" + + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda delay=5 input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 $tdnn_opts dim=448 + relu-batchnorm-layer name=tdnn2 $tdnn_opts dim=448 input=Append(-1,0,1) + relu-batchnorm-layer name=tdnn3 $tdnn_opts dim=448 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn4 $tdnn_opts dim=448 input=Append(-3,0,3) + lstmb-layer name=lstm3 $lstm_opts cell-dim=384 bottleneck-dim=256 decay-time=20 delay=-3 + + ## adding the layers for chain branch + output-layer name=output input=lstm3 $output_opts output-delay=$label_delay include-log-softmax=false dim=$num_targets + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 $output_opts output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/tedlium-$(date +'%m_%d_%H_%M')/s5_r2/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=6 \ + --trainer.deriv-truncate-margin=10 \ + --trainer.frames-per-iter=1500000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=10 \ + --trainer.optimization.initial-effective-lrate=0.0005 \ + --trainer.optimization.final-effective-lrate=0.00005 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 17 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. 
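As an illustration of the point above, the graph could equally be built with a different LM. A sketch, assuming the full-trigram lang directory data/lang_test_tg from the standard WSJ setup (the graph name graph_tg is hypothetical):

    utils/lang/check_phones_compatible.sh data/lang_test_tg/phones.txt $lang/phones.txt
    utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test_tg $tree_dir $tree_dir/graph_tg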
+ + utils/lang/check_phones_compatible.sh \ + data/lang_test_tgpr/phones.txt $lang/phones.txt + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgpr \ + $tree_dir $tree_dir/graph_tgpr || exit 1; + + utils/lang/check_phones_compatible.sh \ + data/lang_test_bd_tgpr/phones.txt $lang/phones.txt + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_bd_tgpr \ + $tree_dir $tree_dir/graph_bd_tgpr || exit 1; +fi + +if [ $stage -le 18 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + data_affix=$(echo $data | sed s/test_//) + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + data_affix=$(echo $data | sed s/test_//) + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + data_affix=$(echo $data | sed s/test_//) + nspk=$(wc -l %.3f", $1, $2); } elsif (m/Combining (\S+) nnets, objective function changed from (\S+) to (\S+)/) { close(F); - return sprintf(" combine=%.3f->%.3f (over %d)", $2, $3, $1); + return sprintf(" combine=%.3f->%.3f (over %d)", $2, $3, $1); } } } @@ -204,6 +204,9 @@ sub get_logprob_and_accuracy_info { if (m/Overall log-probability for 'output' is (\S+) \+ (\S+)/) { $iter_to_train_logprob{$iter} = $1; $iter_to_train_penalty{$iter} = $2; + } elsif (m/Overall log-probability for 'output' is (\S+)/) { + $iter_to_train_logprob{$iter} = $1; + $iter_to_train_penalty{$iter} = 0.0; } elsif (m/Overall log-probability for 'output-xent' is (\S+) per frame/) { $iter_to_train_xent{$iter} = $1; } @@ -213,6 +216,9 @@ sub get_logprob_and_accuracy_info { if (m/Overall log-probability for 'output' is (\S+) \+ (\S+)/) { $iter_to_valid_logprob{$iter} = $1; $iter_to_valid_penalty{$iter} = $2; + } elsif (m/Overall log-probability for 'output' is (\S+)/) { + $iter_to_valid_logprob{$iter} = $1; + $iter_to_valid_penalty{$iter} = 0.0; } elsif (m/Overall log-probability for 'output-xent' is (\S+) per frame/) { $iter_to_valid_xent{$iter} = $1; } diff --git a/egs/wsj/s5/steps/libs/nnet3/report/log_parse.py b/egs/wsj/s5/steps/libs/nnet3/report/log_parse.py index d5f2575d582..905edc1a78b 100755 --- a/egs/wsj/s5/steps/libs/nnet3/report/log_parse.py +++ b/egs/wsj/s5/steps/libs/nnet3/report/log_parse.py @@ -388,8 +388,8 @@ def parse_prob_logs(exp_dir, key='accuracy', output="output"): " key {k} in both {tl} and {vl}".format( k=key, tl=train_prob_files, vl=valid_prob_files)) iters.sort() - return map(lambda x: (int(x), float(train_objf[x]), - float(valid_objf[x])), iters) + return list(map(lambda x: (int(x), float(train_objf[x]), + float(valid_objf[x])), iters)) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py index 5b640510ea1..3df2720b2c0 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py @@ -128,7 +128,7 @@ def train_new_models(dir, iter, srand, num_jobs, l2_regularize, xent_regularize, leaky_hmm_coefficient, momentum, max_param_change, shuffle_buffer_size, num_chunk_per_minibatch_str, - frame_subsampling_factor, run_opts, + frame_subsampling_factor, run_opts, train_opts, backstitch_training_scale=0.0, backstitch_training_interval=1): """ Called from train_one_iteration(), this method trains new models @@ -184,7 +184,7 @@ def train_new_models(dir, iter, srand, num_jobs, --max-param-change={max_param_change} \ --backstitch-training-scale={backstitch_training_scale} \ 
--backstitch-training-interval={backstitch_training_interval} \ - --l2-regularize-factor={l2_regularize_factor} \ + --l2-regularize-factor={l2_regularize_factor} {train_opts} \ --srand={srand} \ "{raw_model}" {dir}/den.fst \ "ark,bg:nnet3-chain-copy-egs \ @@ -201,6 +201,7 @@ def train_new_models(dir, iter, srand, num_jobs, deriv_time_opts=" ".join(deriv_time_opts), app_deriv_wts=apply_deriv_weights, fr_shft=frame_shift, l2=l2_regularize, + train_opts=train_opts, xent_reg=xent_regularize, leaky=leaky_hmm_coefficient, cache_io_opts=cache_io_opts, parallel_train_opts=run_opts.parallel_train_opts, @@ -233,7 +234,7 @@ def train_one_iteration(dir, iter, srand, egs_dir, leaky_hmm_coefficient, momentum, max_param_change, shuffle_buffer_size, frame_subsampling_factor, - run_opts, dropout_edit_string="", + run_opts, dropout_edit_string="", train_opts="", backstitch_training_scale=0.0, backstitch_training_interval=1): """ Called from steps/nnet3/chain/train.py for one iteration for neural network training with LF-MMI objective @@ -306,7 +307,7 @@ def train_one_iteration(dir, iter, srand, egs_dir, shuffle_buffer_size=shuffle_buffer_size, num_chunk_per_minibatch_str=cur_num_chunk_per_minibatch_str, frame_subsampling_factor=frame_subsampling_factor, - run_opts=run_opts, + run_opts=run_opts, train_opts=train_opts, # linearly increase backstitch_training_scale during the # first few iterations (hard-coded as 15) backstitch_training_scale=(backstitch_training_scale * @@ -387,8 +388,8 @@ def compute_preconditioning_matrix(dir, egs_dir, num_lda_jobs, run_opts, rand_prune=rand_prune)) # the above command would have generated dir/{1..num_lda_jobs}.lda_stats - lda_stat_files = map(lambda x: '{0}/{1}.lda_stats'.format(dir, x), - range(1, num_lda_jobs + 1)) + lda_stat_files = list(map(lambda x: '{0}/{1}.lda_stats'.format(dir, x), + range(1, num_lda_jobs + 1))) common_lib.execute_command( """{command} {dir}/log/sum_transform_stats.log \ @@ -480,14 +481,34 @@ def compute_progress(dir, iter, run_opts): common_lib.background_command( """{command} {dir}/log/progress.{iter}.log \ nnet3-am-info {model} '&&' \ - nnet3-show-progress --use-gpu=no \ - "nnet3-am-copy --raw=true {prev_model} - |" \ - "nnet3-am-copy --raw=true {model} - |" + nnet3-show-progress --use-gpu=no {prev_model} {model} """.format(command=run_opts.command, dir=dir, iter=iter, model=model, prev_model=prev_model)) + if iter % 10 == 0 and iter > 0: + # Every 10 iters, print some more detailed information. + # full_progress.X.log contains some diagnostics of the difference in + # parameters, printed in the same format as from nnet3-info. + common_lib.background_command( + """{command} {dir}/log/full_progress.{iter}.log \ + nnet3-show-progress --use-gpu=no --verbose=2 {prev_model} {model} + """.format(command=run_opts.command, + dir=dir, + iter=iter, + model=model, + prev_model=prev_model)) + # full_info.X.log is just the nnet3-info of the model, with the --verbose=2 + # option which includes stats on the singular values of the parameter matrices. 
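A sketch of how these extra diagnostics might be inspected after a run; $dir is the experiment directory, and the grep pattern is illustrative only, since the exact nnet3-info --verbose=2 output format is not reproduced in this patch:

    # written every 10 iterations: full_info.10.log, full_info.20.log, ...
    ls $dir/log/full_info.*.log $dir/log/full_progress.*.log
    # skim the singular-value stats of the parameter matrices in one model
    grep -i singular $dir/log/full_info.10.log | head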
+ common_lib.background_command( + """{command} {dir}/log/full_info.{iter}.log \ + nnet3-info --verbose=2 {model} + """.format(command=run_opts.command, + dir=dir, + iter=iter, + model=model)) + def combine_models(dir, num_iters, models_to_combine, num_chunk_per_minibatch_str, egs_dir, leaky_hmm_coefficient, l2_regularize, diff --git a/egs/wsj/s5/steps/libs/nnet3/train/common.py b/egs/wsj/s5/steps/libs/nnet3/train/common.py index 2b4fdd92cec..443834fc161 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/common.py @@ -531,7 +531,7 @@ def smooth_presoftmax_prior_scale_vector(pdf_counts, scales.append(math.pow(pdf_counts[i] + smooth * average_count, presoftmax_prior_scale_power)) num_pdfs = len(pdf_counts) - scaled_counts = map(lambda x: x * float(num_pdfs) / sum(scales), scales) + scaled_counts = list(map(lambda x: x * float(num_pdfs) / sum(scales), scales)) return scaled_counts @@ -903,6 +903,11 @@ def __init__(self, lstm*=0,0.2,0'. More general should precede less general patterns, as they are applied sequentially.""") + self.parser.add_argument("--trainer.add-option", type=str, + dest='train_opts', action='append', default=[], + help="""You can use this to add arbitrary options that + will be passed through to the core training code (nnet3-train + or nnet3-chain-train)""") self.parser.add_argument("--trainer.optimization.backstitch-training-scale", type=float, dest='backstitch_training_scale', default=0.0, help="""scale of parameters changes diff --git a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py index 8bdcd160409..9dd12e63f52 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py @@ -9,6 +9,8 @@ network without transition model) with frame-level objectives. 
""" +from __future__ import print_statement +from __future__ import division import glob import logging import math @@ -31,7 +33,7 @@ def train_new_models(dir, iter, srand, num_jobs, image_augmentation_opts, run_opts, frames_per_eg=-1, min_deriv_time=None, max_deriv_time_relative=None, - use_multitask_egs=False, + use_multitask_egs=False, train_opts="", backstitch_training_scale=0.0, backstitch_training_interval=1): """ Called from train_one_iteration(), this model does one iteration of training with 'num_jobs' jobs, and writes files like @@ -91,7 +93,7 @@ def train_new_models(dir, iter, srand, num_jobs, archive_index = (k % num_archives) + 1 if not chunk_level_training: - frame = (k / num_archives + archive_index) % frames_per_eg + frame = (k // num_archives + archive_index) % frames_per_eg cache_io_opts = (("--read-cache={dir}/cache.{iter}".format(dir=dir, iter=iter) @@ -142,7 +144,7 @@ def train_new_models(dir, iter, srand, num_jobs, --backstitch-training-scale={backstitch_training_scale} \ --l2-regularize-factor={l2_regularize_factor} \ --backstitch-training-interval={backstitch_training_interval} \ - --srand={srand} \ + --srand={srand} {train_opts} \ {deriv_time_opts} "{raw_model}" "{egs_rspecifier}" \ {dir}/{next_iter}.{job}.raw""".format( command=run_opts.command, @@ -157,6 +159,7 @@ def train_new_models(dir, iter, srand, num_jobs, l2_regularize_factor=1.0/num_jobs, backstitch_training_scale=backstitch_training_scale, backstitch_training_interval=backstitch_training_interval, + train_opts=train_opts, deriv_time_opts=" ".join(deriv_time_opts), raw_model=raw_model_string, egs_rspecifier=egs_rspecifier), @@ -175,9 +178,8 @@ def train_one_iteration(dir, iter, srand, egs_dir, run_opts, image_augmentation_opts=None, frames_per_eg=-1, min_deriv_time=None, max_deriv_time_relative=None, - shrinkage_value=1.0, dropout_edit_string="", - get_raw_nnet_from_am=True, - use_multitask_egs=False, + shrinkage_value=1.0, dropout_edit_string="", train_opts="", + get_raw_nnet_from_am=True, use_multitask_egs=False, backstitch_training_scale=0.0, backstitch_training_interval=1, compute_per_dim_accuracy=False): """ Called from steps/nnet3/train_*.py scripts for one iteration of neural @@ -277,6 +279,7 @@ def train_one_iteration(dir, iter, srand, egs_dir, max_deriv_time_relative=max_deriv_time_relative, image_augmentation_opts=image_augmentation_opts, use_multitask_egs=use_multitask_egs, + train_opts=train_opts, backstitch_training_scale=backstitch_training_scale, backstitch_training_interval=backstitch_training_interval) @@ -344,8 +347,8 @@ def compute_preconditioning_matrix(dir, egs_dir, num_lda_jobs, run_opts, rand_prune=rand_prune)) # the above command would have generated dir/{1..num_lda_jobs}.lda_stats - lda_stat_files = map(lambda x: '{0}/{1}.lda_stats'.format(dir, x), - range(1, num_lda_jobs + 1)) + lda_stat_files = list(map(lambda x: '{0}/{1}.lda_stats'.format(dir, x), + range(1, num_lda_jobs + 1))) common_lib.execute_command( """{command} {dir}/log/sum_transform_stats.log \ @@ -447,6 +450,29 @@ def compute_progress(dir, iter, egs_dir, ''.format(command=run_opts.command, dir=dir, iter=iter, model=model, prev_model=prev_model)) + if iter % 10 == 0 and iter > 0: + # Every 10 iters, print some more detailed information. + # full_progress.X.log contains some diagnostics of the difference in + # parameters, printed in the same format as from nnet3-info. 
+ common_lib.background_command( + """{command} {dir}/log/full_progress.{iter}.log \ + nnet3-show-progress --use-gpu=no --verbose=2 {prev_model} {model} + """.format(command=run_opts.command, + dir=dir, + iter=iter, + model=model, + prev_model=prev_model)) + # full_info.X.log is just the nnet3-info of the model, with the --verbose=2 + # option which includes stats on the singular values of the parameter matrices. + common_lib.background_command( + """{command} {dir}/log/full_info.{iter}.log \ + nnet3-info --verbose=2 {model} + """.format(command=run_opts.command, + dir=dir, + iter=iter, + model=model)) + + def combine_models(dir, num_iters, models_to_combine, egs_dir, minibatch_size_str, @@ -553,7 +579,7 @@ def get_realign_iters(realign_times, num_iters, + realign_time * math.pow(num_jobs_final, 2)) realign_iter = realign_iter - num_jobs_initial - realign_iter = realign_iter / (num_jobs_final - num_jobs_initial) + realign_iter = realign_iter // (num_jobs_final - num_jobs_initial) realign_iter = realign_iter * num_iters realign_iters.append(int(realign_iter)) diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py index 05ae5bcdc18..a3dfa89cf0e 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py @@ -402,8 +402,7 @@ def get_full_config(self): # the input layers need to be printed in 'init.config' (which # initializes the neural network prior to the LDA), in 'ref.config', # which is a version of the config file used for getting left and right - # context (it doesn't read anything for the LDA-like transform and/or - # presoftmax-prior-scale components) + # context (it doesn't read anything for the LDA-like transform). # In 'full.config' we write everything, this is just for reference, # and also for cases where we don't use the LDA-like transform. ans = [] @@ -430,6 +429,9 @@ class XconfigOutputLayer(XconfigLayerBase): Parameters of the class, and their defaults: input='[-1]' : Descriptor giving the input of the layer. dim=None : Output dimension of layer, will normally equal the number of pdfs. + bottleneck-dim=None : Bottleneck dimension of layer: if supplied, instead of + an affine component we'll have a linear then affine, so a linear + bottleneck, with the linear part constrained to be orthonormal. include-log-softmax=true : setting it to false will omit the log-softmax component- useful for chain models. objective-type=linear : the only other choice currently is @@ -441,16 +443,6 @@ class XconfigOutputLayer(XconfigLayerBase): learning-rate-factor=(0.5/xent_regularize), normally learning-rate-factor=5.0 since xent_regularize is normally 0.1. - presoftmax-scale-file=None : If set, a filename for a vector that - will be used to scale the output of the affine component before the - log-softmax (if include-log-softmax=true), or before the output - (if not). This is helpful to avoid instability in training due to - some classes having much more data than others. The way we normally - create this vector is to take the priors of the classes to the - power -0.25 and rescale them so the average is 1.0. This factor - -0.25 is referred to as presoftmax_prior_scale_power in scripts. In - the scripts this would normally be set to - config_dir/presoftmax_prior_scale.vec max-change=1.5 : Can be used to change the max-change parameter in the affine component; this affects how much the matrix can change on each iteration. 
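The bottleneck-dim option documented above is what the run scripts in this patch set via $output_opts; a minimal xconfig line would look something like this (dimensions hypothetical):

    output-layer name=output include-log-softmax=false dim=$num_targets bottleneck-dim=256 max-change=1.5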
@@ -462,6 +454,9 @@ class XconfigOutputLayer(XconfigLayerBase):
    ng-affine-options='' :  Can be used to supply non-default options to the affine
        layer (intended for the natural gradient but can be an arbitrary string
        to be added to the config line. e.g. 'update-period=2'.).
+   ng-linear-options='' : Options, like ng-affine-options, that are passed to
+       the LinearComponent, only in bottleneck layers (i.e. if bottleneck-dim
+       is supplied).
    """

    def __init__(self, first_token, key_to_value, prev_names=None):
@@ -475,13 +470,15 @@ def set_default_configs(self):
        # the most recent layer.
        self.config = {'input': '[-1]',
                       'dim': -1,
+                      'bottleneck-dim': -1,
+                      'orthonormal-constraint': 1.0,
+                      # orthonormal-constraint only matters if bottleneck-dim is set.
                       'include-log-softmax': True,
                       # this would be false for chain models
                       'objective-type': 'linear',
                       # see Nnet::ProcessOutputNodeConfigLine in
                       # nnet-nnet.cc for other options
                       'learning-rate-factor': 1.0,
-                      'presoftmax-scale-file': '',
                       # used in DNN (not RNN) training when using
                       # frame-level objfns,
                       'max-change': 1.5,
@@ -489,7 +486,8 @@ def set_default_configs(self):
                       'bias-stddev': 0.0,
                       'l2-regularize': 0.0,
                       'output-delay': 0,
-                      'ng-affine-options': ''
+                      'ng-affine-options': '',
+                      'ng-linear-options': ''   # only affects bottleneck output layers.
                       }

    def check_configs(self):
@@ -533,8 +531,21 @@ def output_dim(self, auxiliary_output=None):
                " layers")

    def get_full_config(self):
        ans = []
+       config_lines = self._generate_config()
+
+       for line in config_lines:
+           for config_name in ['ref', 'final']:
+               # we do not support user specified matrices in LSTM initialization
+               # so 'ref' and 'final' configs are the same.
+               ans.append((config_name, line))
+       return ans
+
+
+   def _generate_config(self):
+
+       configs = []
        # note: each value of self.descriptors is (descriptor, dim,
        # normalized-string, output-string).
@@ -543,10 +553,10 @@ def get_full_config(self):
        descriptor_final_string = self.descriptors['input']['final-string']
        input_dim = self.descriptors['input']['dim']
        output_dim = self.config['dim']
+       bottleneck_dim = self.config['bottleneck-dim']
        objective_type = self.config['objective-type']
        learning_rate_factor = self.config['learning-rate-factor']
        include_log_softmax = self.config['include-log-softmax']
-       presoftmax_scale_file = self.config['presoftmax-scale-file']
        param_stddev = self.config['param-stddev']
        bias_stddev = self.config['bias-stddev']
        l2_regularize = self.config['l2-regularize']
@@ -558,64 +568,72 @@ def get_full_config(self):
        l2_regularize_option = ('l2-regularize={0} '.format(l2_regularize)
                                if l2_regularize != 0.0 else '')

-       # note: ref.config is used only for getting the left-context and
-       # right-context of the network;
-       # final.config is where we put the actual network definition.
-       for config_name in ['ref', 'final']:
-           # First the affine node.
- line = ('component name={0}.affine' - ' type=NaturalGradientAffineComponent' - ' input-dim={1}' - ' output-dim={2}' - ' param-stddev={3}' - ' bias-stddev={4}' - ' max-change={5} {6} {7} {8}' - ''.format(self.name, input_dim, output_dim, - param_stddev, bias_stddev, max_change, ng_affine_options, - learning_rate_option, l2_regularize_option)) - ans.append((config_name, line)) - - line = ('component-node name={0}.affine' - ' component={0}.affine input={1}' - ''.format(self.name, descriptor_final_string)) - ans.append((config_name, line)) - cur_node = '{0}.affine'.format(self.name) - - if presoftmax_scale_file is not '' and config_name == 'final': - # don't use the presoftmax-scale in 'ref.config' since that - # file won't exist at the time we evaluate it. - # (ref.config is used to find the left/right context). - line = ('component name={0}.fixed-scale' - ' type=FixedScaleComponent scales={1}' - ''.format(self.name, presoftmax_scale_file)) - ans.append((config_name, line)) - - line = ('component-node name={0}.fixed-scale' - ' component={0}.fixed-scale input={1}' - ''.format(self.name, cur_node)) - ans.append((config_name, line)) - cur_node = '{0}.fixed-scale'.format(self.name) + cur_node = descriptor_final_string + cur_dim = input_dim + + if bottleneck_dim >= 0: + if bottleneck_dim == 0 or bottleneck_dim >= input_dim or bottleneck_dim >= output_dim: + raise RuntimeError("Bottleneck dim has value that does not make sense: {0}".format( + bottleneck_dim)) + # This is the bottleneck case (it doesn't necessarily imply we + # will be using the features from the bottleneck; it's just a factorization + # of the matrix into two pieces without a nonlinearity in between). + # We don't include the l2-regularize option because it's useless + # given the orthonormality constraint. + linear_options = self.config['ng-linear-options'] + + # note: by default the LinearComponent uses natural gradient. 
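To see why this factorization shrinks the model, compare parameter counts for a direct affine layer against the linear-then-affine pair (sizes hypothetical, biases ignored):

    # 512 -> 3000 directly, versus 512 -> 256 -> 3000 through the bottleneck
    echo $(( 512 * 3000 ))               # 1536000 parameters
    echo $(( 512 * 256 + 256 * 3000 ))   # 899072 parameters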
+ line = ('component name={0}.linear type=LinearComponent ' + 'orthonormal-constraint={1} param-stddev={2} ' + 'input-dim={3} output-dim={4} max-change=0.75 {5}' + ''.format(self.name, self.config['orthonormal-constraint'], + self.config['orthonormal-constraint'] / math.sqrt(input_dim), + input_dim, bottleneck_dim, linear_options)) + configs.append(line) + line = ('component-node name={0}.linear component={0}.linear input={1}' + ''.format(self.name, cur_node)) + configs.append(line) + cur_node = '{0}.linear'.format(self.name) + cur_dim = bottleneck_dim + + + line = ('component name={0}.affine' + ' type=NaturalGradientAffineComponent' + ' input-dim={1}' + ' output-dim={2}' + ' param-stddev={3}' + ' bias-stddev={4}' + ' max-change={5} {6} {7} {8}' + ''.format(self.name, cur_dim, output_dim, + param_stddev, bias_stddev, max_change, ng_affine_options, + learning_rate_option, l2_regularize_option)) + configs.append(line) + line = ('component-node name={0}.affine' + ' component={0}.affine input={1}' + ''.format(self.name, cur_node)) + configs.append(line) + cur_node = '{0}.affine'.format(self.name) - if include_log_softmax: - line = ('component name={0}.log-softmax' - ' type=LogSoftmaxComponent dim={1}' - ''.format(self.name, output_dim)) - ans.append((config_name, line)) + if include_log_softmax: + line = ('component name={0}.log-softmax' + ' type=LogSoftmaxComponent dim={1}' + ''.format(self.name, output_dim)) + configs.append(line) - line = ('component-node name={0}.log-softmax' - ' component={0}.log-softmax input={1}' - ''.format(self.name, cur_node)) - ans.append((config_name, line)) - cur_node = '{0}.log-softmax'.format(self.name) + line = ('component-node name={0}.log-softmax' + ' component={0}.log-softmax input={1}' + ''.format(self.name, cur_node)) + configs.append(line) + cur_node = '{0}.log-softmax'.format(self.name) - if output_delay != 0: - cur_node = 'Offset({0}, {1})'.format(cur_node, output_delay) + if output_delay != 0: + cur_node = 'Offset({0}, {1})'.format(cur_node, output_delay) - line = ('output-node name={0} input={1} ' - 'objective={2}'.format( - self.name, cur_node, objective_type)) - ans.append((config_name, line)) - return ans + line = ('output-node name={0} input={1} ' + 'objective={2}'.format( + self.name, cur_node, objective_type)) + configs.append(line) + return configs class XconfigBasicLayer(XconfigLayerBase): @@ -637,7 +655,11 @@ class XconfigBasicLayer(XconfigLayerBase): Parameters of the class, and their defaults: input='[-1]' [Descriptor giving the input of the layer.] - dim=None [Output dimension of layer, e.g. 1024] + dim=-1 [Output dimension of layer, e.g. 1024] + bottleneck-dim=-1 [If you set this, a linear bottleneck is added, so + we project to first bottleneck-dim then to dim. The + first of the two matrices is constrained to be + orthonormal.] self-repair-scale=1.0e-05 [Affects relu, sigmoid and tanh layers.] learning-rate-factor=1.0 [This can be used to make the affine component train faster or slower]. @@ -657,12 +679,16 @@ def set_default_configs(self): # the most recent layer. self.config = {'input': '[-1]', 'dim': -1, + 'bottleneck-dim': -1, 'self-repair-scale': 1.0e-05, 'target-rms': 1.0, 'ng-affine-options': '', + 'ng-linear-options': '', # only affects bottleneck layers. 'dropout-proportion': 0.5, # dropout-proportion only # affects layers with - # 'dropout' in the name. + # 'dropout' in the name + 'dropout-per-dim': False, # if dropout-per-dim=true, the dropout + # mask is shared across time. 
'add-log-stddev': False, # the following are not really inspected by this level of # code, just passed through (but not if left at ''). @@ -674,6 +700,10 @@ def set_default_configs(self): def check_configs(self): if self.config['dim'] < 0: raise RuntimeError("dim has invalid value {0}".format(self.config['dim'])) + b = self.config['bottleneck-dim'] + if b >= 0 and (b >= self.config['dim'] or b == 0): + raise RuntimeError("bottleneck-dim has an invalid value {0}".format(b)) + if self.config['self-repair-scale'] < 0.0 or self.config['self-repair-scale'] > 1.0: raise RuntimeError("self-repair-scale has invalid value {0}" .format(self.config['self-repair-scale'])) @@ -751,14 +781,41 @@ def _add_components(self, input_desc, input_dim, nonlinearities): "there is a final 'renorm' component.") configs = [] - # First the affine node. + cur_dim = input_dim + cur_node = input_desc + + # First the affine node (or linear then affine, if bottleneck). + if self.config['bottleneck-dim'] > 0: + # This is the bottleneck case (it doesn't necessarily imply we + # will be using the features from the bottleneck; it's just a factorization + # of the matrix into two pieces without a nonlinearity in between). + # We don't include the l2-regularize option because it's useless + # given the orthonormality constraint. + linear_options = self.config['ng-linear-options'] + for opt_name in [ 'max-change', 'learning-rate-factor' ]: + value = self.config[opt_name] + if value != '': + linear_options += ' {0}={1}'.format(opt_name, value) + + bottleneck_dim = self.config['bottleneck-dim'] + # note: by default the LinearComponent uses natural gradient. + line = ('component name={0}.linear type=LinearComponent ' + 'input-dim={1} orthonormal-constraint=1.0 output-dim={2} {3}' + ''.format(self.name, input_dim, bottleneck_dim, linear_options)) + configs.append(line) + line = ('component-node name={0}.linear component={0}.linear input={1}' + ''.format(self.name, cur_node)) + configs.append(line) + cur_node = '{0}.linear'.format(self.name) + cur_dim = bottleneck_dim + + line = ('component name={0}.affine type=NaturalGradientAffineComponent' ' input-dim={1} output-dim={2} {3}' - ''.format(self.name, input_dim, output_dim, affine_options)) + ''.format(self.name, cur_dim, output_dim, affine_options)) configs.append(line) - line = ('component-node name={0}.affine component={0}.affine input={1}' - ''.format(self.name, input_desc)) + ''.format(self.name, cur_node)) configs.append(line) cur_node = '{0}.affine'.format(self.name) @@ -797,8 +854,7 @@ def _add_components(self, input_desc, input_dim, nonlinearities): elif nonlinearity == 'batchnorm': line = ('component name={0}.{1}' - ' type=BatchNormComponent dim={2}' - ' target-rms={3}' + ' type=BatchNormComponent dim={2} target-rms={3}' ''.format(self.name, nonlinearity, output_dim, target_rms)) @@ -808,10 +864,31 @@ def _add_components(self, input_desc, input_dim, nonlinearities): ''.format(self.name, nonlinearity, output_dim)) elif nonlinearity == 'dropout': - line = ('component name={0}.{1} type=DropoutComponent ' - 'dim={2} dropout-proportion={3}'.format( - self.name, nonlinearity, output_dim, - self.config['dropout-proportion'])) + if not self.config['dropout-per-dim']: + line = ('component name={0}.{1} type=DropoutComponent ' + 'dim={2} dropout-proportion={3}'.format( + self.name, nonlinearity, output_dim, + self.config['dropout-proportion'])) + else: + line = ('component name={0}.dropout_mask type=DropoutMaskComponent ' + 'output-dim={1} dropout-proportion={2}'.format( + self.name, 
output_dim, self.config['dropout-proportion'])) + configs.append(line) + # note: the input to the dropout_mask component is never used, it's + # just syntactically required. + line = ('component-node name={0}.dropout_mask component={0}.dropout_mask ' + 'input={1}'.format(self.name, cur_node)) + configs.append(line) + line = ('component name={0}.dropout type=ElementwiseProductComponent ' + 'input-dim={1} output-dim={2} '.format( + self.name, 2 * output_dim, output_dim)) + configs.append(line) + line = ('component-node name={0}.dropout component={0}.dropout ' + 'input=Append({1}, ReplaceIndex({0}.dropout_mask, t, 0))' + ''.format(self.name, cur_node)) + configs.append(line) + cur_node = '{0}.dropout'.format(self.name) + continue else: raise RuntimeError("Unknown nonlinearity type: {0}" diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py index 9743d0100b9..a7808131a4a 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py @@ -103,7 +103,7 @@ def output_dim(self, auxiliary_output = None): def get_full_config(self): ans = [] - config_lines = self.generate_lstm_config() + config_lines = self._generate_lstm_config() for line in config_lines: for config_name in ['ref', 'final']: @@ -113,7 +113,7 @@ def get_full_config(self): return ans # convenience function to generate the LSTM config - def generate_lstm_config(self): + def _generate_lstm_config(self): # assign some variables to reduce verbosity name = self.name @@ -258,6 +258,8 @@ def generate_lstm_config(self): # This class is for lines like # 'lstmp-layer name=lstm1 input=[-1] delay=-3' +# (you can also use the name 'lstmp-batchnorm-layer' if you want it to be followed +# by batchnorm). # It generates an LSTM sub-graph with output projections. It can also generate # outputs without projection, but you could use the XconfigLstmLayer for this # simple LSTM. @@ -292,7 +294,9 @@ def generate_lstm_config(self): # l2-regularize=0.0 Constant controlling l2 regularization for this layer class XconfigLstmpLayer(XconfigLayerBase): def __init__(self, first_token, key_to_value, prev_names = None): - assert first_token == "lstmp-layer" + # lstmp-batchnorm-layer is like lstmp-layer but followed by a batchnorm + # component. 
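Illustrative xconfig lines for the two features added just above (layer names and dimensions are hypothetical; relu-batchnorm-dropout-layer is one of the standard basic-layer name combinations):

    # per-dim dropout, with the mask shared across time via ReplaceIndex(..., t, 0)
    relu-batchnorm-dropout-layer name=tdnn1 dim=512 dropout-per-dim=true dropout-proportion=0.5
    # projected LSTM followed by batchnorm
    lstmp-batchnorm-layer name=lstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3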
+ assert first_token in ["lstmp-layer", "lstmp-batchnorm-layer"] XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) def set_default_configs(self): @@ -353,7 +357,8 @@ def auxiliary_outputs(self): return ['c_t'] def output_name(self, auxiliary_output = None): - node_name = 'rp_t' + node_name = ( 'rp_t_batchnorm' if self.layer_type == 'lstmp-batchnorm-layer' + else 'rp_t' ) if auxiliary_output is not None: if auxiliary_output in self.auxiliary_outputs(): node_name = auxiliary_output @@ -375,7 +380,7 @@ def output_dim(self, auxiliary_output = None): def get_full_config(self): ans = [] - config_lines = self.generate_lstm_config() + config_lines = self._generate_lstm_config() for line in config_lines: for config_name in ['ref', 'final']: @@ -385,7 +390,7 @@ def get_full_config(self): return ans # convenience function to generate the LSTM config - def generate_lstm_config(self): + def _generate_lstm_config(self): # assign some variables to reduce verbosity name = self.name @@ -542,18 +547,27 @@ def generate_lstm_config(self): configs.append("component name={0}.r type=BackpropTruncationComponent dim={1} {2}" "".format(name, rec_proj_dim, bptrunc_str)) - configs.append("# r_t and p_t : rp_t will be the output") + configs.append("# r_t and p_t : rp_t will be the output (if we're not doing batchnorm)") configs.append("component-node name={0}.rp_t component={0}.W_rp.m input={0}.m_t" "".format(name)) configs.append("dim-range-node name={0}.r_t_preclip input-node={0}.rp_t dim-offset=0 " "dim={1}".format(name, rec_proj_dim)) configs.append("component-node name={0}.r_t component={0}.r input={0}.r_t_preclip".format(name)) + if self.layer_type == "lstmp-batchnorm-layer": + # Add the batchnorm component, if requested to include batchnorm. + configs.append("component name={0}.rp_t_batchnorm type=BatchNormComponent dim={1} ".format( + name, rec_proj_dim + nonrec_proj_dim)) + configs.append("component-node name={0}.rp_t_batchnorm component={0}.rp_t_batchnorm " + "input={0}.rp_t".format(name)) + return configs # This class is for lines like # 'fast-lstm-layer name=lstm1 input=[-1] delay=-3' +# (you can also use the name 'fast-lstm-batchnorm-layer' if you want it to be followed +# by batchnorm). # It generates an LSTM sub-graph without output projections. 
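A minimal usage sketch, mirroring the example line above (cell-dim is hypothetical):

    fast-lstm-batchnorm-layer name=lstm1 cell-dim=512 delay=-3

The only structural difference from fast-lstm-layer is the trailing BatchNormComponent, which is why output_name() below returns 'm_batchnorm' rather than 'm' for the batchnorm variant.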
# Unlike 'lstm-layer', the core nonlinearities of the LSTM are done in a special-purpose # component (LstmNonlinearityComponent), and most of the affine parts of the LSTM are combined @@ -586,7 +600,7 @@ def generate_lstm_config(self): # l2-regularize=0.0 Constant controlling l2 regularization for this layer class XconfigFastLstmLayer(XconfigLayerBase): def __init__(self, first_token, key_to_value, prev_names = None): - assert first_token == "fast-lstm-layer" + assert first_token in ["fast-lstm-layer", "fast-lstm-batchnorm-layer"] XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) def set_default_configs(self): @@ -626,7 +640,8 @@ def auxiliary_outputs(self): return ['c'] def output_name(self, auxiliary_output = None): - node_name = 'm' + node_name = ('m_batchnorm' if self.layer_type == 'fast-lstm-batchnorm-layer' + else 'm') if auxiliary_output is not None: if auxiliary_output == 'c': node_name = 'c' @@ -647,7 +662,7 @@ def output_dim(self, auxiliary_output = None): def get_full_config(self): ans = [] - config_lines = self.generate_lstm_config() + config_lines = self._generate_lstm_config() for line in config_lines: for config_name in ['ref', 'final']: @@ -657,7 +672,7 @@ def get_full_config(self): return ans # convenience function to generate the LSTM config - def generate_lstm_config(self): + def _generate_lstm_config(self): # assign some variables to reduce verbosity name = self.name @@ -697,43 +712,62 @@ def generate_lstm_config(self): # providing output to gate i and operating on an appended vector [x,r] configs.append("### Begin LTSM layer '{0}'".format(name)) configs.append("# Gate control: contains W_i, W_f, W_c and W_o matrices as blocks.") + configs.append("component name={0}.W_all type=NaturalGradientAffineComponent input-dim={1} " "output-dim={2} {3} {4}".format(name, input_dim + cell_dim, cell_dim * 4, affine_str, l2_regularize_option)) + configs.append("# The core LSTM nonlinearity, implemented as a single component.") configs.append("# Input = (i_part, f_part, c_part, o_part, c_{t-1}), output = (c_t, m_t)") configs.append("# See cu-math.h:ComputeLstmNonlinearity() for details.") configs.append("component name={0}.lstm_nonlin type=LstmNonlinearityComponent " "cell-dim={1} {2} {3}".format(name, cell_dim, lstm_str, l2_regularize_option)) + configs.append("# Component for backprop truncation, to avoid gradient blowup in long training examples.") - # Note from Dan: I don't remember why we are applying the backprop - # truncation on both c and m appended together, instead of just on c. - # Possibly there was some memory or speed or WER reason for it which I - # have forgotten about now. 
- configs.append("component name={0}.cm_trunc type=BackpropTruncationComponent dim={1} {2}".format(name, 2 * cell_dim, bptrunc_str)) + configs.append("component name={0}.cm_trunc type=BackpropTruncationComponent dim={1} " + "{2}".format(name, 2 * cell_dim, bptrunc_str)) configs.append("### Nodes for the components above.") - configs.append("component-node name={0}.four_parts component={0}.W_all input=Append({1}, " - "IfDefined(Offset({0}.c_trunc, {2})))".format(name, input_descriptor, delay)) + configs.append("component-node name={0}.W_all component={0}.W_all input=Append({1}, " + "IfDefined(Offset({0}.m_trunc, {2})))".format( + name, input_descriptor, delay)) + configs.append("component-node name={0}.lstm_nonlin component={0}.lstm_nonlin " - "input=Append({0}.four_parts, IfDefined(Offset({0}.c_trunc, {1})))".format(name, delay)) + "input=Append({0}.W_all, IfDefined(Offset({0}.c_trunc, {1})))".format( + name, delay)) # we can print .c later if needed, but it generates a warning since it's not used. could use c_trunc instead #configs.append("dim-range-node name={0}.c input-node={0}.lstm_nonlin dim-offset=0 dim={1}".format(name, cell_dim)) configs.append("dim-range-node name={0}.m input-node={0}.lstm_nonlin dim-offset={1} dim={1}".format(name, cell_dim)) configs.append("component-node name={0}.cm_trunc component={0}.cm_trunc input={0}.lstm_nonlin".format(name)) configs.append("dim-range-node name={0}.c_trunc input-node={0}.cm_trunc dim-offset=0 dim={1}".format(name, cell_dim)) - # configs.append("dim-range-node name={0}.m_trunc input-node={0}.cm_trunc dim-offset={1} dim={1}".format(name, cell_dim)) + configs.append("dim-range-node name={0}.m_trunc input-node={0}.cm_trunc dim-offset={1} dim={1}".format(name, cell_dim)) + + if self.layer_type == "fast-lstm-batchnorm-layer": + # Add the batchnorm component, if requested to include batchnorm. + configs.append("component name={0}.m_batchnorm type=BatchNormComponent dim={1} ".format( + name, cell_dim)) + configs.append("component-node name={0}.m_batchnorm component={0}.m_batchnorm " + "input={0}.m".format(name)) configs.append("### End LTSM layer '{0}'".format(name)) return configs # This class is for lines like -# 'fast-lstmb-layer name=lstm1 input=[-1] delay=-3' -# It's like fast-lstm-layer but with a bottleneck (like an SVD) in the main parameter matrix -# of the LSTM (W_all, which combines all the full-rank projections of the LSTM): we divide -# it into two matrices, with batch-norm in between to stabilize the training. +# 'lstmb-layer name=lstm1 input=[-1] delay=-3' +# +# LSTMB is not something we've published; it's LSTM with a bottleneck in the +# middle of the W_all matrix (where W_all is a matrix that combines the 8 full +# matrices of standard LSTM). W_all is factored into W_all_a and W_all_b, where +# W_all_a is constrained to have orthonormal rows (this keeps it training stably). +# +# It also contains a couple of other improvements: W_all_b is followed by +# trainable ScaleAndOffsetComponent (this is a bit like the idea from the +# publication "Self-stabilized deep neural network" by Ghahramani et al). +# And the LSTM is followed by a batchnorm component (this is by default; it's not +# part of the layer name, like lstmb-batchnorm-layer). + # # The output dimension of the layer may be specified via 'cell-dim=xxx', but if not specified, # the dimension defaults to the same as the input. @@ -761,32 +795,30 @@ def generate_lstm_config(self): # i.e. history since about t = t-20, can be # accumulated in c_t.] 
# l2-regularize=0.0 Constant controlling l2 regularization for this layer -class XconfigFastLstmbLayer(XconfigLayerBase): +class XconfigLstmbLayer(XconfigLayerBase): def __init__(self, first_token, key_to_value, prev_names = None): - assert first_token == "fast-lstmb-layer" + assert first_token == 'lstmb-layer' XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) def set_default_configs(self): - self.config = {'input':'[-1]', + self.config = { 'input':'[-1]', 'cell-dim' : -1, # this is a required argument 'bottleneck-dim': -1, # this is a required argument - 'clipping-threshold' : 30.0, - 'zeroing-interval' : 20, - 'zeroing-threshold' : 15.0, + 'clipping-threshold': 30.0, + 'zeroing-interval': 20, + 'zeroing-threshold': 15.0, + 'orthonormal-constraint': 1.0, 'delay' : -1, - # if you want to set 'self-repair-scale' (c.f. the - # self-repair-scale-nonlinearity config value in older LSTM layers), you can - # add 'self-repair-scale=xxx' to - # lstm-nonlinearity-options. 'lstm-nonlinearity-options' : ' max-change=0.75', + # the recurrence scale is the scale on m_trunc, used in the + # recurrence (to balance its size with the input). + 'self-scale' : 1.0, # the affine layer contains 4 of our old layers -> use a # larger max-change than the normal value of 0.75. 'ng-affine-options' : ' max-change=1.5', - 'normalize-type': 'batchnorm', # can be 'batchnorm', 'renorm', or 'none' 'l2-regularize': 0.0, 'decay-time': -1.0 } - self.c_needed = False # keep track of whether the 'c' output is needed. def set_derived_configs(self): if self.config['cell-dim'] <= 0: @@ -801,34 +833,21 @@ def check_configs(self): self.config['bottleneck-dim'])) if self.config['delay'] == 0: raise RuntimeError("delay cannot be zero") - assert self.config['normalize-type'] in ['batchnorm', 'renorm', 'none'] def auxiliary_outputs(self): - return ['c'] + return [] def output_name(self, auxiliary_output = None): - node_name = 'm' - if auxiliary_output is not None: - if auxiliary_output == 'c': - node_name = 'c' - self.c_needed = True - else: - raise RuntimeError("Unknown auxiliary output name {0}".format(auxiliary_output)) - return '{0}.{1}'.format(self.name, node_name) + assert auxiliary_output is None + return '{0}.m_batchnorm'.format(self.name) def output_dim(self, auxiliary_output = None): - if auxiliary_output is not None: - if auxiliary_output == 'c': - self.c_needed = True - return self.config['cell-dim'] - # add code for other auxiliary_outputs here when we decide to expose them - else: - raise RuntimeError("Unknown auxiliary output name {0}".format(auxiliary_output)) + assert auxiliary_output is None return self.config['cell-dim'] def get_full_config(self): ans = [] - config_lines = self.generate_lstm_config() + config_lines = self._generate_lstm_config() for line in config_lines: for config_name in ['ref', 'final']: @@ -838,7 +857,7 @@ def get_full_config(self): return ans # convenience function to generate the LSTM config - def generate_lstm_config(self): + def _generate_lstm_config(self): # assign some variables to reduce verbosity name = self.name @@ -847,6 +866,7 @@ def generate_lstm_config(self): input_descriptor = self.descriptors['input']['final-string'] cell_dim = self.config['cell-dim'] bottleneck_dim = self.config['bottleneck-dim'] + self_scale = self.config['self-scale'] delay = self.config['delay'] affine_str = self.config['ng-affine-options'] l2_regularize = self.config['l2-regularize'] @@ -872,23 +892,25 @@ def generate_lstm_config(self): configs = [] - # See XconfigFastLstmLayer to 
understand what's going on here. - # This differs from that code by a factorization of the W_all matrix. + # See XconfigFastLstmLayer to understand what's going on here. This + # differs from that code by a factorization of the W_all matrix into two + # pieces with a smaller dimension in between (with the first of the two + # pieces constrained to have orthonormal rows). Note: we don't apply l2 + # regularization to this layer, since, with the orthonormality + # constraint, it's meaningless. configs.append("### Begin LTSM layer '{0}'".format(name)) configs.append("component name={0}.W_all_a type=LinearComponent input-dim={1} " - "output-dim={2} {3} {4}".format(name, input_dim + cell_dim, bottleneck_dim, - affine_str, l2_regularize_option)) - normalize_type = self.config['normalize-type'] - if normalize_type == 'batchnorm': - configs.append("component name={0}.W_batchnorm type=BatchNormComponent dim={1} ".format( - name, bottleneck_dim)) - elif normalize_type == 'renorm': - configs.append("component name={0}.W_renorm type=NormalizeComponent dim={1} ".format( - name, bottleneck_dim)) - - configs.append("component name={0}.W_all_b type=NaturalGradientAffineComponent input-dim={1} " + "orthonormal-constraint={2} output-dim={3} {4}".format( + name, input_dim + cell_dim, + self.config['orthonormal-constraint'], + bottleneck_dim, affine_str)) + + configs.append("component name={0}.W_all_b type=LinearComponent input-dim={1} " "output-dim={2} {3} {4}".format(name, bottleneck_dim, cell_dim * 4, affine_str, l2_regularize_option)) + configs.append("component name={0}.W_all_b_so type=ScaleAndOffsetComponent dim={1} " + "max-change=0.75".format(name, cell_dim * 4)) + configs.append("# The core LSTM nonlinearity, implemented as a single component.") configs.append("# Input = (i_part, f_part, c_part, o_part, c_{t-1}), output = (c_t, m_t)") @@ -897,32 +919,33 @@ def generate_lstm_config(self): "cell-dim={1} {2} {3}".format(name, cell_dim, lstm_str, l2_regularize_option)) configs.append("# Component for backprop truncation, to avoid gradient blowup in long training examples.") - # Note from Dan: I don't remember why we are applying the backprop - # truncation on both c and m appended together, instead of just on c. - # Possibly there was some memory or speed or WER reason for it which I - # have forgotten about now. 
- configs.append("component name={0}.cm_trunc type=BackpropTruncationComponent dim={1} {2}".format(name, 2 * cell_dim, bptrunc_str)) + + configs.append("component name={0}.cm_trunc type=BackpropTruncationComponent dim={1} {2}".format( + name, 2 * cell_dim, bptrunc_str)) + configs.append("component name={0}.m_batchnorm type=BatchNormComponent dim={1} ".format( + name, cell_dim)) configs.append("### Nodes for the components above.") configs.append("component-node name={0}.W_all_a component={0}.W_all_a input=Append({1}, " - "IfDefined(Offset({0}.c_trunc, {2})))".format(name, input_descriptor, delay)) - if normalize_type != 'none': - configs.append("component-node name={0}.W_{1} component={0}.W_{1} " - "input={0}.W_all_a".format(name, - normalize_type)) - configs.append("component-node name={0}.W_all_b component={0}.W_all_b " - "input={0}.W_{1}".format(name, normalize_type)) - else: - configs.append("component-node name={0}.W_all_b component={0}.W_all_b " - "input={0}.W_all_a".format(name)) + "IfDefined(Offset(Scale({2}, {0}.m_trunc), {3})))".format( + name, input_descriptor, self_scale, delay)) + configs.append("component-node name={0}.W_all_b component={0}.W_all_b " + "input={0}.W_all_a".format(name)) + configs.append("component-node name={0}.W_all_b_so component={0}.W_all_b_so " + "input={0}.W_all_b".format(name)) + configs.append("component-node name={0}.lstm_nonlin component={0}.lstm_nonlin " - "input=Append({0}.W_all_b, IfDefined(Offset({0}.c_trunc, {1})))".format(name, delay)) - # we can print .c later if needed, but it generates a warning since it's not used. could use c_trunc instead - #configs.append("dim-range-node name={0}.c input-node={0}.lstm_nonlin dim-offset=0 dim={1}".format(name, cell_dim)) - configs.append("dim-range-node name={0}.m input-node={0}.lstm_nonlin dim-offset={1} dim={1}".format(name, cell_dim)) + "input=Append({0}.W_all_b_so, IfDefined(Offset({0}.c_trunc, {1})))".format( + name, delay)) + configs.append("dim-range-node name={0}.m input-node={0}.lstm_nonlin dim-offset={1} " + "dim={1}".format(name, cell_dim)) configs.append("component-node name={0}.cm_trunc component={0}.cm_trunc input={0}.lstm_nonlin".format(name)) - configs.append("dim-range-node name={0}.c_trunc input-node={0}.cm_trunc dim-offset=0 dim={1}".format(name, cell_dim)) - # configs.append("dim-range-node name={0}.m_trunc input-node={0}.cm_trunc dim-offset={1} dim={1}".format(name, cell_dim)) + configs.append("dim-range-node name={0}.c_trunc input-node={0}.cm_trunc dim-offset=0 " + "dim={1}".format(name, cell_dim)) + configs.append("dim-range-node name={0}.m_trunc input-node={0}.cm_trunc dim-offset={1} " + "dim={1}".format(name, cell_dim)) + configs.append("component-node name={0}.m_batchnorm component={0}.m_batchnorm " + "input={0}.m".format(name)) configs.append("### End LTSM layer '{0}'".format(name)) return configs @@ -933,6 +956,8 @@ def generate_lstm_config(self): # 'fast-lstmp-layer name=lstm1 input=[-1] delay=-3' # or: # 'fast-lstmp-layer name=lstm1 input=[-1] delay=-3 cell-dim=1024 recurrent-projection-dim=512 non-recurrent-projection-dim=512' +# (you can also use the name 'fast-lstmp-batchnorm-layer' if you want it to be followed +# by batchnorm). # It generates an LSTM sub-graph with output projections (i.e. a projected LSTM, AKA LSTMP). 
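As with the other batchnorm variants above, usage is a drop-in rename; an illustrative line, following the example just quoted:

    fast-lstmp-batchnorm-layer name=lstm1 cell-dim=1024 recurrent-projection-dim=512 non-recurrent-projection-dim=512 delay=-3

Here batchnorm is applied to the projected output rp, so the layer's output node becomes rp_batchnorm.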
# Unlike 'lstmp-layer', the core nonlinearities of the LSTM are done in a special-purpose # component (LstmNonlinearityComponent), and most of the affine parts of the LSTM are combined @@ -968,7 +993,7 @@ def generate_lstm_config(self): # l2-regularize=0.0 Constant controlling l2 regularization for this layer class XconfigFastLstmpLayer(XconfigLayerBase): def __init__(self, first_token, key_to_value, prev_names = None): - assert first_token == "fast-lstmp-layer" + assert first_token in ['fast-lstmp-layer', 'fast-lstmp-batchnorm-layer'] XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) def set_default_configs(self): @@ -1026,7 +1051,8 @@ def auxiliary_outputs(self): return ['c_t'] def output_name(self, auxiliary_output = None): - node_name = 'rp' + node_name = ('rp_batchnorm' if self.layer_type == 'fast-lstmp-batchnorm-layer' + else 'rp') if auxiliary_output is not None: if auxiliary_output in self.auxiliary_outputs(): node_name = auxiliary_output @@ -1048,7 +1074,7 @@ def output_dim(self, auxiliary_output = None): def get_full_config(self): ans = [] - config_lines = self.generate_lstm_config() + config_lines = self._generate_lstm_config() for line in config_lines: for config_name in ['ref', 'final']: @@ -1058,8 +1084,7 @@ def get_full_config(self): return ans # convenience function to generate the LSTM config - def generate_lstm_config(self): - + def _generate_lstm_config(self): # assign some variables to reduce verbosity name = self.name # in the below code we will just call descriptor_strings as descriptors for conciseness @@ -1104,8 +1129,9 @@ def generate_lstm_config(self): configs.append("## Begin LTSM layer '{0}'".format(name)) configs.append("# Gate control: contains W_i, W_f, W_c and W_o matrices as blocks.") configs.append("component name={0}.W_all type=NaturalGradientAffineComponent input-dim={1} " - "output-dim={2} {3} {4}".format(name, input_dim + rec_proj_dim, cell_dim * 4, - affine_str, l2_regularize_option)) + "output-dim={2} {3} {4}".format( + name, input_dim + rec_proj_dim, cell_dim * 4, + affine_str, l2_regularize_option)) configs.append("# The core LSTM nonlinearity, implemented as a single component.") configs.append("# Input = (i_part, f_part, c_part, o_part, c_{t-1}), output = (c_t, m_t)") configs.append("# See cu-math.h:ComputeLstmNonlinearity() for details.") @@ -1123,29 +1149,32 @@ def generate_lstm_config(self): .format(name, dropout_proportion)) configs.append("# Component specific to 'projected' LSTM (LSTMP), contains both recurrent"); configs.append("# and non-recurrent projections") - configs.append("component name={0}.W_rp type=NaturalGradientAffineComponent input-dim={1} " - "output-dim={2} {3} {4}".format( + configs.append("component name={0}.W_rp type=NaturalGradientAffineComponent " + "input-dim={1} output-dim={2} {3} {4}".format( name, cell_dim, rec_proj_dim + nonrec_proj_dim, affine_str, l2_regularize_option)) configs.append("### Nodes for the components above.") - configs.append("component-node name={0}.four_parts component={0}.W_all input=Append({1}, " + configs.append("component-node name={0}.W_all component={0}.W_all input=Append({1}, " "IfDefined(Offset({0}.r_trunc, {2})))".format(name, input_descriptor, delay)) + if dropout_proportion != -1.0: # note: the 'input' is a don't-care as the component never uses it; it's required # in component-node lines. 
configs.append("component-node name={0}.dropout_mask component={0}.dropout_mask " "input={0}.dropout_mask".format(name)) configs.append("component-node name={0}.lstm_nonlin component={0}.lstm_nonlin " - "input=Append({0}.four_parts, IfDefined(Offset({0}.c_trunc, {1})), {0}.dropout_mask)" - .format(name, delay)) + "input=Append({0}.W_all, IfDefined(Offset({0}.c_trunc, {1})), " + "{0}.dropout_mask)".format(name, delay)) else: configs.append("component-node name={0}.lstm_nonlin component={0}.lstm_nonlin " - "input=Append({0}.four_parts, IfDefined(Offset({0}.c_trunc, {1})))".format(name, delay)) + "input=Append({0}.W_all, IfDefined(Offset({0}.c_trunc, {1})))".format( + name, delay)) configs.append("dim-range-node name={0}.c input-node={0}.lstm_nonlin " "dim-offset=0 dim={1}".format(name, cell_dim)) configs.append("dim-range-node name={0}.m input-node={0}.lstm_nonlin " "dim-offset={1} dim={1}".format(name, cell_dim)) - configs.append("# {0}.rp is the output node of this layer:".format(name)) + configs.append("# {0}.rp is the output node of this layer (if we're not " + "including batchnorm)".format(name)) configs.append("component-node name={0}.rp component={0}.W_rp input={0}.m".format(name)) configs.append("dim-range-node name={0}.r input-node={0}.rp dim-offset=0 " "dim={1}".format(name, rec_proj_dim)) @@ -1158,6 +1187,12 @@ def generate_lstm_config(self): "dim-offset=0 dim={1}".format(name, cell_dim)) configs.append("dim-range-node name={0}.r_trunc input-node={0}.cr_trunc " "dim-offset={1} dim={2}".format(name, cell_dim, rec_proj_dim)) + if self.layer_type == "fast-lstmp-batchnorm-layer": + # Add the batchnorm component, if requested to include batchnorm. + configs.append("component name={0}.rp_batchnorm type=BatchNormComponent dim={1} ".format( + name, rec_proj_dim + nonrec_proj_dim)) + configs.append("component-node name={0}.rp_batchnorm component={0}.rp_batchnorm " + "input={0}.rp".format(name)) configs.append("### End LSTM Layer '{0}'".format(name)) return configs diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py index 7ab70027cef..6fbde1fbbcc 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py @@ -34,9 +34,12 @@ 'affine-layer' : xlayers.XconfigAffineLayer, 'lstm-layer' : xlayers.XconfigLstmLayer, 'lstmp-layer' : xlayers.XconfigLstmpLayer, + 'lstmp-batchnorm-layer' : xlayers.XconfigLstmpLayer, 'fast-lstm-layer' : xlayers.XconfigFastLstmLayer, + 'fast-lstm-batchnorm-layer' : xlayers.XconfigFastLstmLayer, 'fast-lstmp-layer' : xlayers.XconfigFastLstmpLayer, - 'fast-lstmb-layer' : xlayers.XconfigFastLstmbLayer, + 'fast-lstmp-batchnorm-layer' : xlayers.XconfigFastLstmpLayer, + 'lstmb-layer' : xlayers.XconfigLstmbLayer, 'stats-layer': xlayers.XconfigStatsLayer, 'relu-conv-layer': xlayers.XconfigConvLayer, 'conv-layer': xlayers.XconfigConvLayer, @@ -65,7 +68,9 @@ 'opgru-layer' : xlayers.XconfigOpgruLayer, 'norm-pgru-layer' : xlayers.XconfigNormPgruLayer, 'norm-opgru-layer' : xlayers.XconfigNormOpgruLayer, - 'renorm-component': xlayers.XconfigRenormComponent + 'renorm-component': xlayers.XconfigRenormComponent, + 'no-op-component': xlayers.XconfigNoOpComponent, + 'linear-component': xlayers.XconfigLinearComponent } # Turn a config line and a list of previous layers into diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/trivial_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/trivial_layers.py index 80a2b7df418..63f6278d1ca 100644 --- 
a/egs/wsj/s5/steps/libs/nnet3/xconfig/trivial_layers.py
+++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/trivial_layers.py
@@ -68,3 +68,132 @@ def _generate_config(self):
            self.name, input_desc))
        configs.append(line)
        return configs
+
+
+class XconfigNoOpComponent(XconfigLayerBase):
+    """This class is for parsing lines like
+    'no-op-component name=noop1 input=Append(-3,0,3)'
+    which will produce just a single component, of type NoOpComponent.
+
+    Parameters of the class, and their defaults:
+      input='[-1]'             [Descriptor giving the input of the layer.]
+    """
+    def __init__(self, first_token, key_to_value, prev_names=None):
+        XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names)
+
+    def set_default_configs(self):
+        self.config = {'input': '[-1]' }
+
+    def check_configs(self):
+        pass
+
+    def output_name(self, auxiliary_output=None):
+        assert auxiliary_output is None
+        return self.name
+
+    def output_dim(self, auxiliary_output=None):
+        assert auxiliary_output is None
+        input_dim = self.descriptors['input']['dim']
+        return input_dim
+
+    def get_full_config(self):
+        ans = []
+        config_lines = self._generate_config()
+
+        for line in config_lines:
+            for config_name in ['ref', 'final']:
+                # we do not support user specified matrices in this layer
+                # so 'ref' and 'final' configs are the same.
+                ans.append((config_name, line))
+        return ans
+
+    def _generate_config(self):
+        # by 'descriptor_final_string' we mean a string that can appear in
+        # config-files, i.e. it contains the 'final' names of nodes.
+        input_desc = self.descriptors['input']['final-string']
+        input_dim = self.descriptors['input']['dim']
+
+        configs = []
+        line = ('component name={0} type=NoOpComponent dim={1}'.format(
+            self.name, input_dim))
+        configs.append(line)
+        line = ('component-node name={0} component={0} input={1}'.format(
+            self.name, input_desc))
+        configs.append(line)
+        return configs
+
+
+class XconfigLinearComponent(XconfigLayerBase):
+    """This class is for parsing lines like
+    'linear-component name=linear1 dim=1024 input=Append(-3,0,3)'
+    which will produce just a single component, of type LinearComponent, with
+    output-dim 1024 in this case, and input-dim determined by the dimension
+    of the input.
+
+    Parameters of the class, and their defaults:
+      input='[-1]'             [Descriptor giving the input of the layer.]
+      dim=-1                   [Dimension of the output]
+
+    The following (shown with their effective defaults) are just passed through
+    to the component's config line.
+
+      orthonormal-constraint=-1
+      max-change=0.75
+      l2-regularize=0.0
+
+    """
+    def __init__(self, first_token, key_to_value, prev_names=None):
+        XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names)
+
+    def set_default_configs(self):
+        self.config = {'input': '[-1]',
+                       'dim': -1,
+                       'orthonormal-constraint': '',
+                       'max-change': 0.75,
+                       'l2-regularize': '' }
+
+    def check_configs(self):
+        if self.config['dim'] <= 0:
+            raise RuntimeError("'dim' must be specified and > 0.")
+
+    def output_name(self, auxiliary_output=None):
+        assert auxiliary_output is None
+        return self.name
+
+    def output_dim(self, auxiliary_output=None):
+        assert auxiliary_output is None
+        assert self.config['dim'] > 0
+        return self.config['dim']
+
+    def get_full_config(self):
+        ans = []
+        config_lines = self._generate_config()
+
+        for line in config_lines:
+            for config_name in ['ref', 'final']:
+                # we do not support user specified matrices in this layer
+                # so 'ref' and 'final' configs are the same.
+ ans.append((config_name, line)) + return ans + + def _generate_config(self): + # by 'descriptor_final_string' we mean a string that can appear in + # config-files, i.e. it contains the 'final' names of nodes. + input_desc = self.descriptors['input']['final-string'] + input_dim = self.descriptors['input']['dim'] + output_dim = self.config['dim'] + + opts = '' + for opt_name in ['orthonormal-constraint', 'max-change', 'l2-regularize']: + value = self.config[opt_name] + if value != '': + opts += ' {0}={1}'.format(opt_name, value) + + configs = [] + line = ('component name={0} type=LinearComponent input-dim={1} output-dim={2} ' + '{3}'.format(self.name, input_dim, output_dim, opts)) + configs.append(line) + line = ('component-node name={0} component={0} input={1}'.format( + self.name, input_desc)) + configs.append(line) + return configs diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/utils.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/utils.py index 9ff7f1e2258..08de18167cd 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/utils.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/utils.py @@ -6,6 +6,7 @@ # while xconfig_layers.py contains the code specific to layer types. from __future__ import print_function +from __future__ import division import re import sys @@ -277,6 +278,12 @@ def dim(self, layer_to_dim): return self.items[0].dim(layer_to_dim) elif self.operator == 'Append': return sum([ x.dim(layer_to_dim) for x in self.items]) + elif self.operator == 'Scale': + # e.g. Scale(2.0, lstm1). Return dim of 2nd arg. + return self.items[1].dim(layer_to_dim) + elif self.operator == 'Const': + # e.g. Const(0.5, 512). Return 2nd arg, which is an int. + return self.items[1] else: raise RuntimeError("Unknown operator {0}".format(self.operator)) @@ -312,7 +319,8 @@ def parse_new_descriptor(tokens, pos, prev_names): # when reading this function, be careful to note the indent level, # there is an if-statement within an if-statement. - if first_token in [ 'Offset', 'Round', 'ReplaceIndex', 'Append', 'Sum', 'Switch', 'Failover', 'IfDefined' ]: + if first_token in [ 'Offset', 'Round', 'ReplaceIndex', 'Append', 'Sum', + 'Switch', 'Failover', 'IfDefined' ]: expect_token('(', tokens[pos], first_token + '()') pos += 1 d.operator = first_token @@ -392,6 +400,38 @@ def parse_new_descriptor(tokens, pos, prev_names): pos += 1 else: raise RuntimeError("code error") + elif first_token in ['Scale', 'Const' ]: + # Parsing something like 'Scale(2.0, lstm1)' or 'Const(1.0, 512)' + expect_token('(', tokens[pos], first_token + '()') + pos += 1 + d.operator = first_token + # First arg of Scale() and Const() is a float: the scale or value, + # respectively. + try: + value = float(tokens[pos]) + pos += 1 + d.items = [value] + except: + raise RuntimeError("Parsing {0}, expected float, got {1}".format( + first_token, tokens[pos])) + # Consume the comma. + expect_token(',', tokens[pos], first_token + '()') + pos += 1 + if first_token == 'Scale': + # Second arg of Scale() is a Descriptor. 
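The dim() rules for the two new descriptor operators (added to utils.py above) are worth spelling out: Scale(scale, desc) inherits the dimension of its descriptor argument, while Const(value, dim) carries its dimension as a literal integer. A minimal runnable sketch, with FixedDimDescriptor standing in for a real parsed descriptor:

# Sketch of Descriptor.dim() for the new operators; mirrors the elif branches
# added above. FixedDimDescriptor is a stand-in for a real parsed descriptor.
def descriptor_dim(operator, items, layer_to_dim):
    if operator == 'Scale':
        # items = [2.0, <descriptor>]: dim comes from the 2nd arg.
        return items[1].dim(layer_to_dim)
    elif operator == 'Const':
        # items = [0.5, 512]: the 2nd arg is already the dim.
        return items[1]

class FixedDimDescriptor:
    def __init__(self, dim): self._dim = dim
    def dim(self, layer_to_dim): return self._dim

print(descriptor_dim('Scale', [2.0, FixedDimDescriptor(512)], {}))  # -> 512
print(descriptor_dim('Const', [0.5, 512], {}))                      # -> 512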
+ (desc, pos) = parse_new_descriptor(tokens, pos, prev_names) + d.items.append(desc) + else: + assert first_token == 'Const' + try: + dim = int(tokens[pos]) + pos += 1 + d.items.append(dim) + except: + raise RuntimeError("Parsing Const() expression, expected int, got {0}".format( + tokens[pos])) + expect_token(')', tokens[pos], first_token) + pos += 1 elif first_token in [ 'end of string', '(', ')', ',', '@' ]: raise RuntimeError("Expected descriptor, got " + first_token) elif is_valid_line_name(first_token) or first_token == '[': @@ -555,7 +595,7 @@ def parse_config_line(orig_config_line): rest_of_line = ' '.join(fields) # rest of the line can be of the form 'a=1 b=" x=1 y=2 " c=Append( i1, i2)' - positions = map(lambda x: x.start(), re.finditer('"', rest_of_line)) + positions = list(map(lambda x: x.start(), re.finditer('"', rest_of_line))) if not len(positions) % 2 == 0: raise RuntimeError("Double-quotes should occur in pairs") @@ -565,7 +605,7 @@ def parse_config_line(orig_config_line): # and replace the quotation marks themselves with spaces. # Then later on we'll convert all the question marks to # equals signs in the values in the dicts. - num_strings = len(positions) / 2 + num_strings = len(positions) // 2 fields = [] for i in range(num_strings): start = positions[i * 2] @@ -588,7 +628,7 @@ def parse_config_line(orig_config_line): if not (other_fields[0] == '' and len(other_fields) % 2 == 1): raise RuntimeError("Could not parse config line."); fields += other_fields[1:] - num_variables = len(fields) / 2 + num_variables = len(fields) // 2 for i in range(num_variables): var_name = fields[i * 2] var_value = fields[i * 2 + 1] @@ -634,6 +674,8 @@ def test_library(): ('Append(-3,0,3)', 'Append(Offset(prev_layer, -3), prev_layer, Offset(prev_layer, 3))'), ('[-1]', 'prev_layer'), + ('Scale(2.0,foo)', 'Scale(2.0, foo)'), + ('Const(0.5,500)', 'Const(0.5, 500)'), ('[-2]', 'last_but_one_layer'), ('[-2]@3', 'Offset(last_but_one_layer, 3)') ]: diff --git a/egs/wsj/s5/steps/make_phone_graph.sh b/egs/wsj/s5/steps/make_phone_graph.sh index 817f7d1f10b..aaf88cc66d2 100755 --- a/egs/wsj/s5/steps/make_phone_graph.sh +++ b/egs/wsj/s5/steps/make_phone_graph.sh @@ -8,6 +8,7 @@ # is to be used for segmentation, and uses that together with a model to # make a decoding graph. # Uses SRILM. +# See also utils/lang/make_phone_bigram_lm.sh. # Begin configuration section. stage=0 diff --git a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh index cec6f8e166f..0294df0d84a 100755 --- a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh +++ b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh @@ -150,6 +150,8 @@ if [ -f $data/utt2uniq ]; then # this matters if you use data augmentation. rm $dir/uniq2utt $dir/valid_uttlist.tmp fi +echo "$0: creating egs. 
To ensure they are not deleted later you can do: touch $dir/.nodelete" + cat $data/utt2dur | \ awk -v min_len=$frames_per_eg -v fs=$frame_shift '{if ($2 * 1/fs >= min_len) print $1}' | \ utils/filter_scp.pl --exclude $dir/valid_uttlist | \ diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index 6896da67f73..144d29641fd 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -216,15 +216,12 @@ def process_args(args): "--trainer.deriv-truncate-margin.".format( args.deriv_truncate_margin)) - if (not os.path.exists(args.dir) - or (not os.path.exists(args.dir+"/configs") and - (args.input_model is None or not os.path.exists(args.input_model)))): - raise Exception("This script expects {0} to exist. Also either " - "--trainer.input-model option as initial 'raw' model " - "(used as 0.raw in the script) should be supplied or " - "{0}/configs directory which is the output of " - "make_configs.py script should be provided." - "".format(args.dir)) + if (not os.path.exists(args.dir)): + raise Exception("This script expects --dir={0} to exist.") + if (not os.path.exists(args.dir+"/configs") and + (args.input_model is None or not os.path.exists(args.input_model))): + raise Exception("Either --trainer.input-model option should be supplied, " + "and exist; or the {0}/configs directory should exist.") if args.transform_dir is None: args.transform_dir = args.lat_dir @@ -274,6 +271,10 @@ def train(args, run_opts): chain_lib.check_for_required_files(args.feat_dir, args.tree_dir, args.lat_dir) + # Copy phones.txt from tree-dir to dir. Later, steps/nnet3/decode.sh will + # use it to check compatibility between training and decoding phone-sets. + shutil.copy('{0}/phones.txt'.format(args.tree_dir), args.dir) + # Set some variables. num_jobs = common_lib.get_number_of_jobs(args.tree_dir) feat_dim = common_lib.get_feat_dim(args.feat_dir) @@ -505,6 +506,7 @@ def train(args, run_opts): args.dropout_schedule, float(num_archives_processed) / num_archives_to_process, iter), + train_opts=' '.join(args.train_opts), shrinkage_value=shrinkage_value, num_chunk_per_minibatch_str=args.num_chunk_per_minibatch, apply_deriv_weights=args.apply_deriv_weights, @@ -522,7 +524,7 @@ def train(args, run_opts): backstitch_training_interval=args.backstitch_training_interval) if args.cleanup: - # do a clean up everythin but the last 2 models, under certain + # do a clean up everything but the last 2 models, under certain # conditions common_train_lib.remove_model( args.dir, iter-2, num_iters, models_to_combine, @@ -573,8 +575,9 @@ def train(args, run_opts): # delete it remove_egs = False + # leave the last-two-numbered models, for diagnostic reasons. common_train_lib.clean_nnet_dir( - args.dir, num_iters, egs_dir, + args.dir, num_iters - 1, egs_dir, preserve_model_interval=args.preserve_model_interval, remove_egs=remove_egs) @@ -588,7 +591,7 @@ def train(args, run_opts): with open("{dir}/accuracy.report".format(dir=args.dir), "w") as f: f.write(report) - common_lib.execute_command("steps/info/nnet3_dir_info.pl " + common_lib.execute_command("steps/info/chain_dir_info.pl " "{0}".format(args.dir)) diff --git a/egs/wsj/s5/steps/nnet3/chain/train_tdnn.sh b/egs/wsj/s5/steps/nnet3/chain/train_tdnn.sh index 4ba8cae2d56..f5340fb4611 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train_tdnn.sh +++ b/egs/wsj/s5/steps/nnet3/chain/train_tdnn.sh @@ -158,6 +158,9 @@ for f in $data/feats.scp $treedir/ali.1.gz $treedir/final.mdl $treedir/tree \ [ ! 
-f $f ] && echo "$0: no such file $f" && exit 1; done +# Copy phones.txt from tree-dir to dir. Later, steps/nnet3/decode.sh will +# use it to check compatibility between training and decoding phone-sets. +cp $treedir/phones.txt $dir # Set some variables. nj=`cat $treedir/num_jobs` || exit 1; # number of jobs in alignment dir... diff --git a/egs/wsj/s5/steps/nnet3/decode.sh b/egs/wsj/s5/steps/nnet3/decode.sh index 50e02629db0..8c520e0b5e1 100755 --- a/egs/wsj/s5/steps/nnet3/decode.sh +++ b/egs/wsj/s5/steps/nnet3/decode.sh @@ -70,6 +70,8 @@ if [ ! -z "$online_ivector_dir" ]; then extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period" fi +utils/lang/check_phones_compatible.sh {$srcdir,$graphdir}/phones.txt || exit 1 + for f in $graphdir/HCLG.fst $data/feats.scp $model $extra_files; do [ ! -f $f ] && echo "$0: no such file $f" && exit 1; done diff --git a/egs/wsj/s5/steps/nnet3/get_egs.sh b/egs/wsj/s5/steps/nnet3/get_egs.sh index a6dd9682616..c8cbf67c8b8 100755 --- a/egs/wsj/s5/steps/nnet3/get_egs.sh +++ b/egs/wsj/s5/steps/nnet3/get_egs.sh @@ -138,6 +138,8 @@ awk '{print $1}' $data/utt2spk | utils/filter_scp.pl --exclude $dir/valid_uttlis [ -z "$transform_dir" ] && transform_dir=$alidir +echo "$0: creating egs. To ensure they are not deleted later you can do: touch $dir/.nodelete" + # because we'll need the features with a different number of jobs than $alidir, # copy to ark,scp. if [ -f $transform_dir/raw_trans.1 ]; then diff --git a/egs/wsj/s5/steps/nnet3/train_dnn.py b/egs/wsj/s5/steps/nnet3/train_dnn.py index 073ad3e7d7a..e21fdb9f43e 100755 --- a/egs/wsj/s5/steps/nnet3/train_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_dnn.py @@ -162,6 +162,10 @@ def train(args, run_opts): arg_string = pprint.pformat(vars(args)) logger.info("Arguments for the experiment\n{0}".format(arg_string)) + # Copy phones.txt from ali-dir to dir. Later, steps/nnet3/decode.sh will + # use it to check compatibility between training and decoding phone-sets. + shutil.copy('{0}/phones.txt'.format(args.ali_dir), args.dir) + # Set some variables. # num_leaves = common_lib.get_number_of_leaves_from_tree(args.ali_dir) num_jobs = common_lib.get_number_of_jobs(args.ali_dir) @@ -328,6 +332,7 @@ def train(args, run_opts): args.dropout_schedule, float(num_archives_processed) / num_archives_to_process, iter), + train_opts=' '.join(args.train_opts), minibatch_size_str=args.minibatch_size, frames_per_eg=args.frames_per_eg, momentum=args.momentum, @@ -365,16 +370,16 @@ def train(args, run_opts): egs_dir=egs_dir, minibatch_size_str=args.minibatch_size, run_opts=run_opts, max_objective_evaluations=args.max_objective_evaluations) - + if args.stage <= num_iters + 1: logger.info("Getting average posterior for purposes of " "adjusting the priors.") - + # If args.do_final_combination is true, we will use the combined model. # Otherwise, we will use the last_numbered model. 
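The check_phones_compatible.sh call added to decode.sh above compares the training and graph phone tables while ignoring disambiguation symbols (it does cmp on the two tables after grep -v "^#"). A sketch of the same comparison in Python:

# Sketch of the compatibility test: two phone-symbol tables are compatible if
# they match once lines beginning with '#' (disambiguation symbols) are dropped.
def phones_compatible(table1_path, table2_path):
    def load(path):
        with open(path) as f:
            return [line for line in f if not line.startswith('#')]
    return load(table1_path) == load(table2_path)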
real_iter = 'combined' if args.do_final_combination else num_iters avg_post_vec_file = train_lib.common.compute_average_posterior( - dir=args.dir, iter=real_iter, + dir=args.dir, iter=real_iter, egs_dir=egs_dir, num_archives=num_archives, prior_subset_size=args.prior_subset_size, run_opts=run_opts) diff --git a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py index 2d092ceebc7..d5b37871d70 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py @@ -356,6 +356,7 @@ def train(args, run_opts): args.dropout_schedule, float(num_archives_processed) / num_archives_to_process, iter), + train_opts=' '.join(args.train_opts), minibatch_size_str=args.minibatch_size, frames_per_eg=args.frames_per_eg, momentum=args.momentum, diff --git a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py index b51632e7d2c..686b76aa7db 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py @@ -432,6 +432,7 @@ def train(args, run_opts): args.dropout_schedule, float(num_archives_processed) / num_archives_to_process, iter), + train_opts=' '.join(args.train_opts), shrinkage_value=shrinkage_value, minibatch_size_str=args.num_chunk_per_minibatch, min_deriv_time=min_deriv_time, diff --git a/egs/wsj/s5/steps/nnet3/train_rnn.py b/egs/wsj/s5/steps/nnet3/train_rnn.py index 005e751cae0..1d2135c90c2 100755 --- a/egs/wsj/s5/steps/nnet3/train_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_rnn.py @@ -218,6 +218,10 @@ def train(args, run_opts): arg_string = pprint.pformat(vars(args)) logger.info("Arguments for the experiment\n{0}".format(arg_string)) + # Copy phones.txt from ali-dir to dir. Later, steps/nnet3/decode.sh will + # use it to check compatibility between training and decoding phone-sets. + shutil.copy('{0}/phones.txt'.format(args.ali_dir), args.dir) + # Set some variables. num_jobs = common_lib.get_number_of_jobs(args.ali_dir) feat_dim = common_lib.get_feat_dim(args.feat_dir) @@ -410,6 +414,7 @@ def train(args, run_opts): args.dropout_schedule, float(num_archives_processed) / num_archives_to_process, iter), + train_opts=' '.join(args.train_opts), shrinkage_value=shrinkage_value, minibatch_size_str=args.num_chunk_per_minibatch, min_deriv_time=min_deriv_time, diff --git a/egs/wsj/s5/steps/nnet3/train_tdnn.sh b/egs/wsj/s5/steps/nnet3/train_tdnn.sh index fbcf426b205..f023d38b26c 100755 --- a/egs/wsj/s5/steps/nnet3/train_tdnn.sh +++ b/egs/wsj/s5/steps/nnet3/train_tdnn.sh @@ -148,6 +148,9 @@ for f in $data/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/final.mdl $alidir/ [ ! -f $f ] && echo "$0: no such file $f" && exit 1; done +# Copy phones.txt from ali-dir to dir. Later, steps/nnet3/decode.sh will +# use it to check compatibility between training and decoding phone-sets. +cp $alidir/phones.txt $dir # Set some variables. num_leaves=`tree-info $alidir/tree 2>/dev/null | grep num-pdfs | awk '{print $2}'` || exit 1 diff --git a/egs/wsj/s5/utils/dict_dir_add_pronprobs.sh b/egs/wsj/s5/utils/dict_dir_add_pronprobs.sh index 50191cf90cb..59ae4a4c994 100755 --- a/egs/wsj/s5/utils/dict_dir_add_pronprobs.sh +++ b/egs/wsj/s5/utils/dict_dir_add_pronprobs.sh @@ -6,6 +6,11 @@ # 2015 Hainan Xu +# The thing that this script implements is described in the paper: +# "PRONUNCIATION AND SILENCE PROBABILITY MODELING FOR ASR" +# by Guoguo Chen et al, see +# http://www.danielpovey.com/files/2015_interspeech_silprob.pdf + . 
./path.sh || exit 1; # begin configuration @@ -73,7 +78,7 @@ fi # the cat and awk commands below are implementing add-one smoothing. cat <(awk '{print 1, $0;}' <$dir/lexicon.txt) $pron_counts | \ awk '{ count = $1; $1 = ""; word_count[$2] += count; pron_count[$0] += count; pron2word[$0] = $2; } - END{ for (p in pron_count) { word = pron2word[p]; num = pron_count[p]; den = word_count[word]; + END{ for (p in pron_count) { word = pron2word[p]; num = pron_count[p]; den = word_count[word]; print num / den, p } } ' | \ awk '{ word = $2; $2 = $1; $1 = word; print; }' | grep -v '^' |\ sort -k1,1 -k2g,2 -k3 > $dir/lexiconp.txt @@ -108,6 +113,11 @@ fi # Create $dir/lexiconp_silprob.txt and $dir/silprob.txt if silence counts file # exists. The format of $dir/lexiconp_silprob.txt is: # word pron-prob P(s_r | w) F(s_l | w) F(n_l | w) pron +# where: P(s_r | w) is the probability of silence to the right of the word +# F(s_l | w) is a factor which is greater than one if silence to the +# left of the word is more than averagely probable. +# F(n_l | w) is a factor which is greater than one if nonsilence to the +# left of the word is more than averagely probable. if [ -n "$sil_counts" ]; then if [ ! -s "$sil_counts" ]; then echo "$0: expected file $sil_counts to exist and not empty" && exit 1; @@ -175,7 +185,7 @@ if [ -n "$sil_counts" ]; then # Computes F(s_l | w) and F(n_l | w) in the paper. $lambda3 = 2; # Smoothing term, \lambda_3 in the paper. foreach my $wpron (keys %all_wprons) { - @col = split(" ", $wpron); + @col = split(" ", $wpron); $word = shift @col; $pron = join(" ", @col); $pron_prob = $all_wprons{$wpron}; @@ -189,7 +199,7 @@ if [ -n "$sil_counts" ]; then print LPSP "$word $pron_prob $P_w_sr{$wpron} $F_sl_w $F_nl_w $pron\n"; } - + # Create silprob.txt $BOS_sil_count = $wpron_sil{""} + $sil_prob * $lambda2; $BOS_nonsil_count = $wpron_nonsil{""} + (1 - $sil_prob) * $lambda2; @@ -206,7 +216,7 @@ if [ -n "$sil_counts" ]; then fi # now regenerate lexicon.txt from lexiconp.txt, to make sure the lines are -# in the same order. +# in the same order. cat $dir/lexiconp.txt | awk '{$2 = ""; print;}' | sed 's/ / /g' >$dir/lexicon.txt diff --git a/egs/wsj/s5/utils/lang/check_phones_compatible.sh b/egs/wsj/s5/utils/lang/check_phones_compatible.sh index 18301a900c5..cfad06d2b8c 100755 --- a/egs/wsj/s5/utils/lang/check_phones_compatible.sh +++ b/egs/wsj/s5/utils/lang/check_phones_compatible.sh @@ -18,11 +18,8 @@ # except for possible differences in disambiguation symbols (meaning that all # symbols except those beginning with a # are mapped to the same values). # Otherwise it prints a warning and exits with status 1. -# For the sake of compatibility with other scripts that did not write the -# phones.txt to model directories, this script exits silently with status 0 -# if one of the phone symbol tables does not exist. -# For the sake of compatibility with other scripts that did not write the -# phones.txt to model directories, this script exits silently with status 0 +# For the sake of compatibility with other scripts that did not write the +# phones.txt to model directories, this script exits silently with status 0 # if one of the phone symbol tables does not exist. . utils/parse_options.sh || exit 1; @@ -36,24 +33,24 @@ fi table_first=$1 table_second=$2 -# check the files exist or not +# check if the files exist or not if [ ! -f $table_first ]; then if [ ! -f $table_second ]; then echo "$0: Error! Both of the two phones-symbol tables are absent." 
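To make the lexiconp_silprob.txt quantities documented above concrete, here is a small numeric sketch of the smoothed P(s_r | w) estimate, following the count-plus-prior pattern visible in the Perl code (the counts and the lambda value are made up; the exact treatment of the F(s_l | w) and F(n_l | w) factors follows the Chen et al. paper and is not reproduced here):

# Hedged sketch of the smoothed right-silence probability for one (word, pron):
# smoothed counts are raw counts plus lambda2 mass split by the corpus-wide
# silence prior, as in the BOS computation shown above.
sil_count, nonsil_count = 40.0, 60.0   # hypothetical right-context counts
sil_prob = 0.3                         # corpus-wide P(silence), hypothetical
lambda2 = 2.0                          # smoothing mass, \lambda_2 in the paper
P_sr_w = (sil_count + sil_prob * lambda2) / (sil_count + nonsil_count + lambda2)
print(P_sr_w)   # ~0.398: smoothed probability of silence after the word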
echo "Please check your command" exit 1; else - #The phones-symbol-table1 is absent. The model directory maybe created by old script. - #For back compatibility, this script exits silently with status 0. + # The phones-symbol-table1 is absent. The model directory maybe created by old script. + # For back compatibility, this script exits silently with status 0. exit 0; fi elif [ ! -f $table_second ]; then - #The phones-symbol-table2 is absent. The model directory maybe created by old script. - #For back compatibility, this script exits silently with status 0. + # The phones-symbol-table2 is absent. The model directory maybe created by old script. + # For back compatibility, this script exits silently with status 0. exit 0; fi -#Check the two tables are same or not (except for possible difference in disambiguation symbols). +# Check if the two tables are the same (except for possible difference in disambiguation symbols). if ! cmp -s <(grep -v "^#" $table_first) <(grep -v "^#" $table_second); then echo "$0: phone symbol tables $table_first and $table_second are not compatible." exit 1; diff --git a/egs/wsj/s5/utils/lang/make_phone_bigram_lang.sh b/egs/wsj/s5/utils/lang/make_phone_bigram_lang.sh index dcb77bb1342..1d3d04896b4 100755 --- a/egs/wsj/s5/utils/lang/make_phone_bigram_lang.sh +++ b/egs/wsj/s5/utils/lang/make_phone_bigram_lang.sh @@ -9,10 +9,10 @@ # is to limit the number of transitions, so we can decode reasonably fast, and the # graph won't blow up. This is probably going to be most useful for things like # language-id. +# +# See also steps/make_phone_graph.sh -# We might later have options here; if not, I'll emove this. - echo "$0 $@" # Print the command line for logging [ -f ./path.sh ] && . ./path.sh; # source the path. diff --git a/src/.version b/src/.version index d346e2ab7f2..37c2d9960ec 100644 --- a/src/.version +++ b/src/.version @@ -1 +1 @@ -5.3 +5.4 diff --git a/src/INSTALL b/src/INSTALL index f40a514c4b6..d794cab67ee 100644 --- a/src/INSTALL +++ b/src/INSTALL @@ -9,21 +9,12 @@ You must first have completed the installation steps in ../tools/INSTALL The installation instructions are ./configure --shared - make depend - make - -Note that "make" takes a long time. You can speed it up by running make -in parallel if you have multiple CPUs, e.g. to use 8 CPUs - make depend -j 8 make -j 8 -Kaldi requires a relatively recent C++ compiler with C++11 support, -e.g. g++ >= 4.7, Apple clang >= 5.0 or LLVM clang >= 3.3. If your system -default compiler does not support C++11, you can specify a C++11 compliant -compiler by setting the CXX environment variable, e.g. - - CXX=g++-4.8 ./configure --shared +Note that we added the "-j 8" to run in parallel because "make" takes a long +time. 8 jobs might be too many for a laptop or small desktop machine with not +many cores. For more information, see documentation at http://kaldi-asr.org/doc/ and click on "The build process (how Kaldi is compiled)". diff --git a/src/chain/chain-denominator.cc b/src/chain/chain-denominator.cc index 2b27d4b9176..620ea873eb7 100644 --- a/src/chain/chain-denominator.cc +++ b/src/chain/chain-denominator.cc @@ -260,7 +260,7 @@ bool DenominatorComputation::Backward( BetaGeneralFrameDebug(t); Beta(t); if (t % kMaxDerivTimeSteps == 0) { - // commit the derivative stored in exp_nnet_output_transposed_ by adding + // commit the derivative stored in nnet_output_deriv_transposed_ by adding // its transpose to the appropriate sub-matrix of 'nnet_output_deriv'. 
int32 chunk_frames = std::min(static_cast(kMaxDerivTimeSteps), frames_per_sequence_ - t), diff --git a/src/chain/chain-denominator.h b/src/chain/chain-denominator.h index a4a417c8a5d..f44588e434f 100644 --- a/src/chain/chain-denominator.h +++ b/src/chain/chain-denominator.h @@ -51,7 +51,7 @@ namespace chain { All this is done in parallel over multiple sequences, but the computations are independent over the separate sequences, so we won't introduce any notation - or index for the sequence; we'll just explain it for one sequences. + or index for the sequence; we'll just explain it for one sequence. Suppose we have I hmm-states, numbered i = 0 ... I-1 (we'll use i and j for hmm-state indexes). Let foll(i) give a list of arcs leaving state i, and @@ -313,4 +313,3 @@ class DenominatorComputation { } // namespace kaldi #endif // KALDI_CHAIN_CHAIN_DENOMINATOR_H_ - diff --git a/src/chain/chain-supervision-test.cc b/src/chain/chain-supervision-test.cc index 7bf3c17854a..d14c80cd84f 100644 --- a/src/chain/chain-supervision-test.cc +++ b/src/chain/chain-supervision-test.cc @@ -607,8 +607,8 @@ void TestRanges() { int main() { using namespace kaldi; SetVerboseLevel(1); - int32 loop = 0; #if HAVE_CUDA == 1 + int32 loop = 0; for (loop = 0; loop < 2; loop++) { CuDevice::Instantiate().SetDebugStrideMode(true); if (loop == 0) diff --git a/src/chain/chain-training.cc b/src/chain/chain-training.cc index 53de69a0e07..bf61bed67f0 100644 --- a/src/chain/chain-training.cc +++ b/src/chain/chain-training.cc @@ -33,38 +33,45 @@ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, BaseFloat *l2_term, BaseFloat *weight, CuMatrixBase *nnet_output_deriv, - CuMatrixBase *xent_output_deriv) { - BaseFloat num_logprob_weighted; - if (nnet_output_deriv) + CuMatrix *xent_output_deriv) { + BaseFloat num_logprob_weighted, den_logprob_weighted; + bool ok = true; + if (nnet_output_deriv != NULL) nnet_output_deriv->SetZero(); + + { // Doing the denominator first helps to reduce the maximum + // memory use, as we can set 'xent_deriv' to nonempty after + // we've freed the memory in this object. + DenominatorComputation denominator(opts, den_graph, + supervision.num_sequences, + nnet_output); + + den_logprob_weighted = supervision.weight * denominator.Forward(); + if (nnet_output_deriv) + ok = denominator.Backward(-supervision.weight, + nnet_output_deriv); + } + + if (xent_output_deriv != NULL) + xent_output_deriv->Resize(nnet_output.NumRows(), nnet_output.NumCols()); + + { NumeratorComputation numerator(supervision, nnet_output); // note: supervision.weight is included as a factor in the derivative from - // the numerator object, and the logprob too. + // the numerator object, as well as the returned logprob. num_logprob_weighted = numerator.Forward(); - if (nnet_output_deriv) { - numerator.Backward(nnet_output_deriv); - if (xent_output_deriv) - xent_output_deriv->CopyFromMat(*nnet_output_deriv); - } else if (xent_output_deriv) { - // this branch will be taken if xent_output_deriv but not - // nnet_output_deriv is set- which could happen if you want to compute the - // cross-entropy objective but not the derivatives. 
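The control-flow change in this hunk is easier to see in miniature: the denominator forward-backward now runs first (so its temporaries are freed before xent_output_deriv is allocated, lowering peak memory), and the numerator derivative is written once into xent_output_deriv and then added into nnet_output_deriv instead of being computed into both. A runnable toy with stand-in math (softmax/argmax replace the real forward-backward computations):

import numpy as np

def softmax(x):
    e = np.exp(x - x.max())
    return e / e.sum()

def toy_chain_objf(out, weight=1.0, want_deriv=True, want_xent=True):
    deriv = np.zeros_like(out) if want_deriv else None
    # Denominator part first: its temporaries die before xent_deriv exists.
    den_logprob = weight * float(np.logaddexp.reduce(out.ravel()))
    if deriv is not None:
        deriv -= weight * softmax(out)            # stand-in for denominator.Backward
    xent_deriv = np.zeros_like(out) if want_xent else None
    num_logprob = weight * float(out.max())       # stand-in for numerator.Forward
    num_post = np.zeros_like(out)
    num_post.flat[out.argmax()] = weight          # stand-in numerator "posterior"
    if xent_deriv is not None:
        xent_deriv += num_post                    # numerator writes into xent_deriv...
        if deriv is not None:
            deriv += xent_deriv                   # ...and is added in, not recomputed
    elif deriv is not None:
        deriv += num_post
    return num_logprob - den_logprob, deriv, xent_deriv

print(toy_chain_objf(np.random.randn(4, 5))[0])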
- xent_output_deriv->SetZero(); + + if (xent_output_deriv) { numerator.Backward(xent_output_deriv); + if (nnet_output_deriv) + nnet_output_deriv->AddMat(1.0, *xent_output_deriv); + } else if (nnet_output_deriv) { + numerator.Backward(nnet_output_deriv); } } - DenominatorComputation denominator(opts, den_graph, - supervision.num_sequences, - nnet_output); - - BaseFloat den_logprob = denominator.Forward(); - bool ok = true; - if (nnet_output_deriv) - ok = denominator.Backward(-supervision.weight, - nnet_output_deriv); - *objf = num_logprob_weighted - supervision.weight * den_logprob; + *objf = num_logprob_weighted - den_logprob_weighted; *weight = supervision.weight * supervision.num_sequences * supervision.frames_per_sequence; if (!((*objf) - (*objf) == 0) || !ok) { @@ -86,7 +93,7 @@ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, // for different frames of the sequences. As expected, they are // smaller towards the edges of the sequences (due to the penalization // of 'incorrect' pdf-ids. - if (GetVerboseLevel() >= 1 && nnet_output_deriv != NULL) { + if (GetVerboseLevel() >= 1 && nnet_output_deriv != NULL && RandInt(0, 10) == 0) { int32 tot_frames = nnet_output_deriv->NumRows(), frames_per_sequence = supervision.frames_per_sequence, num_sequences = supervision.num_sequences; diff --git a/src/chain/chain-training.h b/src/chain/chain-training.h index e6143d10846..d6535902625 100644 --- a/src/chain/chain-training.h +++ b/src/chain/chain-training.h @@ -63,7 +63,7 @@ struct ChainTrainingOptions { ChainTrainingOptions(): l2_regularize(0.0), leaky_hmm_coefficient(1.0e-05), xent_regularize(0.0) { } - + void Register(OptionsItf *opts) { opts->Register("l2-regularize", &l2_regularize, "l2 regularization " "constant for 'chain' training, applied to the output " @@ -107,10 +107,13 @@ struct ChainTrainingOptions { You don't have to zero this before passing to this function, we zero it internally. @param [out] xent_output_deriv If non-NULL, then the numerator part of the derivative - (which equals a posterior from the numerator forward-backward, - scaled by the supervision weight) is written to here. This will - be used in the cross-entropy regularization code. This value - is also used in computing the cross-entropy objective value. + (which equals a posterior from the numerator + forward-backward, scaled by the supervision weight) + is written to here (this function will set it to the + correct size first; doing it this way reduces the + peak memory use). xent_output_deriv will be used in + the cross-entropy regularization code; it is also + used in computing the cross-entropy objective value. 
*/ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, const DenominatorGraph &den_graph, @@ -120,12 +123,11 @@ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, BaseFloat *l2_term, BaseFloat *weight, CuMatrixBase *nnet_output_deriv, - CuMatrixBase *xent_output_deriv = NULL); - + CuMatrix *xent_output_deriv = NULL); + } // namespace chain } // namespace kaldi #endif // KALDI_CHAIN_CHAIN_TRAINING_H_ - diff --git a/src/cudamatrix/Makefile b/src/cudamatrix/Makefile index e6ade23728f..ca831390ea9 100644 --- a/src/cudamatrix/Makefile +++ b/src/cudamatrix/Makefile @@ -7,11 +7,11 @@ LDLIBS += $(CUDA_LDLIBS) TESTFILES = cu-vector-test cu-matrix-test cu-math-test cu-test cu-sp-matrix-test cu-packed-matrix-test cu-tp-matrix-test \ cu-block-matrix-test cu-matrix-speed-test cu-vector-speed-test cu-sp-matrix-speed-test cu-array-test \ - cu-sparse-matrix-test cu-device-test cu-rand-speed-test + cu-sparse-matrix-test cu-device-test cu-rand-speed-test cu-compressed-matrix-test OBJFILES = cu-device.o cu-math.o cu-rand.o cu-matrix.o cu-packed-matrix.o cu-sp-matrix.o \ cu-vector.o cu-common.o cu-tp-matrix.o cu-block-matrix.o \ - cu-sparse-matrix.o cu-allocator.o cu-array.o + cu-sparse-matrix.o cu-allocator.o cu-array.o cu-compressed-matrix.o ifeq ($(CUDA), true) OBJFILES += cu-kernels.o endif @@ -33,4 +33,3 @@ endif $(CUDATKDIR)/bin/nvcc -c $< -o $@ $(CUDA_INCLUDE) $(CUDA_FLAGS) $(CUDA_ARCH) -I../ include ../makefiles/default_rules.mk - diff --git a/src/cudamatrix/cu-allocator.h b/src/cudamatrix/cu-allocator.h index f2ccf0d6c29..0f96315e848 100644 --- a/src/cudamatrix/cu-allocator.h +++ b/src/cudamatrix/cu-allocator.h @@ -54,7 +54,7 @@ struct CuAllocatorOptions { // is a constant overhead proportional to the number of buckets. BaseFloat delete_factor; - CuAllocatorOptions(): memory_factor(1.5), + CuAllocatorOptions(): memory_factor(1.3), delete_factor(0.001) { } void Check() { diff --git a/src/cudamatrix/cu-compressed-matrix-test.cc b/src/cudamatrix/cu-compressed-matrix-test.cc new file mode 100644 index 00000000000..3cbd7bd5060 --- /dev/null +++ b/src/cudamatrix/cu-compressed-matrix-test.cc @@ -0,0 +1,179 @@ +// cudamatrix/cu-compressed-matrix-test.cc + +// Copyright 2018 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#include +#include +#include + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "cudamatrix/cu-matrix-lib.h" + +using namespace kaldi; + + +namespace kaldi { + +void CuCompressedMatrixTestSign() { + int32 num_rows = RandInt(80, 100), + num_cols = RandInt(80, 100); + CuMatrix M(num_rows, num_cols); + M.SetRandn(); + + CuMatrix M2(num_rows, num_cols, kUndefined); + + CuCompressedMatrixBase *cm = NewCuCompressedMatrix(kCompressedMatrixUint8, 0.0); + + // this just stores (M(i, j) > 0 ? 1 : 0). 
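The round trip this test checks can be modeled in a few lines of numpy: with range == 0, compression keeps only the sign information, so compress-then-uncompress should equal Heaviside(M):

# Numpy model of the sign-compression round trip (toy sizes):
import numpy as np
M = np.random.randn(5, 4).astype(np.float32)
compressed = (M > 0).astype(np.uint8)        # CopyFromMat with range == 0
M2 = compressed.astype(np.float32)           # CopyToMat reconstructs 0/1 values
assert np.array_equal(M2, (M > 0).astype(np.float32))   # i.e. M.Heaviside(M)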
+ cm->CopyFromMat(M); + cm->CopyToMat(&M2); + + M.Heaviside(M); + + AssertEqual(M, M2); + delete cm; +} + +void CuCompressedMatrixTestNonnegative() { + int32 num_rows = RandInt(80, 100), + num_cols = RandInt(80, 100); + CuMatrix M(num_rows, num_cols); + M.SetRandUniform(); + + BaseFloat range = 0.5 * RandInt(1, 5); + M.Scale(range); + + CuCompressedMatrixType t = (RandInt(0, 1) == 0 ? + kCompressedMatrixUint8 : + kCompressedMatrixUint16); + + // since the input is in the correct range, truncating or not should make no + // difference. + bool truncate = (RandInt(0, 1) == 0); + + BaseFloat extra_error = 0.0; + if (truncate && (RandInt(0, 1) == 0)) { + // this tests that with truncate == true, adding a small offset, which would + // take us outside the representable range, will not add too much extra + // error. (with truncate == false this would not be true because we wouldn't + // round to the edges of the range, it would wrap around). + extra_error = -0.01 * (RandInt(0, 1) == 0 ? 1.0 : -1.0); + M.Add(extra_error); + } + + CuCompressedMatrixBase *cm = NewCuCompressedMatrix(t, range, truncate); + + CuMatrix M2(num_rows, num_cols, kUndefined); + + cm->CopyFromMat(M); + cm->CopyToMat(&M2); + + + M2.AddMat(-1.0, M); + + BaseFloat diff_max = M2.Max(), + diff_min = M2.Min(); + + BaseFloat + headroom = 1.1, + max_expected_error = fabs(extra_error) + headroom * 0.5 * + range / (t == kCompressedMatrixUint8 ? 255 : 65535); + + KALDI_ASSERT(diff_max < max_expected_error && + diff_min > -1.0 * max_expected_error); + + delete cm; +} + +// this is like CuCompressedMatrixTestNonnegative but +// with signed integers, and input in the range [-range, +range]. +void CuCompressedMatrixTestSymmetric() { + int32 num_rows = RandInt(80, 100), + num_cols = RandInt(80, 100); + CuMatrix M(num_rows, num_cols); + M.SetRandUniform(); + M.Scale(2.0); + M.Add(-1.0); + + BaseFloat range = 0.5 * RandInt(1, 5); + M.Scale(range); + + CuCompressedMatrixType t = (RandInt(0, 1) == 0 ? + kCompressedMatrixInt8 : + kCompressedMatrixInt16); + + // since the input is in the correct range, truncating or not should make no + // difference. + bool truncate = (RandInt(0, 1) == 0); + + BaseFloat extra_error = 0.0; + if (truncate && (RandInt(0, 1) == 0)) { + // this tests that with truncate == true, adding a small offset, which would + // take us outside the representable range, will not add too much extra + // error. (with truncate == false this would not be true because we wouldn't + // round to the edges of the range, it would wrap around). + extra_error = -0.01 * (RandInt(0, 1) == 0 ? 1.0 : -1.0); + M.Add(extra_error); + } + + CuCompressedMatrixBase *cm = NewCuCompressedMatrix(t, range, truncate); + + CuMatrix M2(num_rows, num_cols, kUndefined); + + cm->CopyFromMat(M); + cm->CopyToMat(&M2); + + + M2.AddMat(-1.0, M); + + BaseFloat diff_max = M2.Max(), + diff_min = M2.Min(); + + BaseFloat + headroom = 1.1, + max_expected_error = fabs(extra_error) + headroom * 0.5 * + range / (t == kCompressedMatrixInt8 ? 127 : 32767); + + KALDI_ASSERT(diff_max < max_expected_error && + diff_min > -1.0 * max_expected_error); + + delete cm; +} + + + +} // namespace kaldi + + +int main() { + SetVerboseLevel(1); + // we don't run this test if CUDA is not compiled in, since + // you can't instantiate class CuCompressedMatrix in that case. 
+#if HAVE_CUDA == 1 + CuDevice::Instantiate().SelectGpuId("yes"); + for (int32 i = 1; i < 10; i++) { + CuCompressedMatrixTestSign(); + CuCompressedMatrixTestNonnegative(); + CuCompressedMatrixTestSymmetric(); + } + +#endif + return 0; +} diff --git a/src/cudamatrix/cu-compressed-matrix.cc b/src/cudamatrix/cu-compressed-matrix.cc new file mode 100644 index 00000000000..be02921169d --- /dev/null +++ b/src/cudamatrix/cu-compressed-matrix.cc @@ -0,0 +1,142 @@ +// cudamatrix/cu-compressed-matrix.cc + +// Copyright 2018 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#if HAVE_CUDA == 1 +#include +#include +#endif + +#include "base/timer.h" +#include "cudamatrix/cu-common.h" +#include "cudamatrix/cu-vector.h" +#include "cudamatrix/cu-device.h" +#include "cudamatrix/cu-kernels.h" +#include "cudamatrix/cu-array.h" +#include "cudamatrix/cu-compressed-matrix.h" + +namespace kaldi { + + +template +CuCompressedMatrix::CuCompressedMatrix(BaseFloat range, bool truncate): + data_(NULL), scale_(range / std::numeric_limits::max()), + truncate_(truncate), num_rows_(0), num_cols_(0), stride_(0) { +#if HAVE_CUDA == 1 + KALDI_ASSERT(CuDevice::Instantiate().Enabled()); +#else + KALDI_ERR << "You instantiated CuCompressedMatrix while GPU use " + "was not compiled in."; +#endif +} + +template +void CuCompressedMatrix::Destroy() { +#if HAVE_CUDA == 1 + if (data_ != NULL) { + // we don't bother timing this because Free() won't normally have to + // access the GPU at all (due to caching). + CuDevice::Instantiate().Free(data_); + data_ = NULL; + num_rows_ = 0; + num_cols_ = 0; + stride_ = 0; + } +#endif +} + +template +void CuCompressedMatrix::CopyFromMat( + const CuMatrixBase &mat) { +#if HAVE_CUDA == 1 + KALDI_ASSERT(CuDevice::Instantiate().Enabled()); + if (mat.NumRows() == 0) + return; + if (num_rows_ != mat.NumRows() || num_cols_ != mat.NumCols()) { + Destroy(); + num_rows_ = mat.NumRows(); + num_cols_ = mat.NumCols(); + data_ = static_cast( + CuDevice::Instantiate().Malloc(sizeof(I) * num_rows_ * num_cols_)); + stride_ = num_cols_; + } + + { + CuTimer tim; + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); + + if (scale_ == 0.0) { // scale == 0 calls a different kernel from the others. 
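The constructor above fixes scale_ = range / numeric_limits::max, and the truncate flag decides what happens to out-of-range inputs at compression time. A numpy sketch of the two modes for int8 (values chosen just outside the representable range to show the difference):

import numpy as np
range_ = 1.0
scale = range_ / 127.0                           # scale_ for int8
f = np.array([1.02, -1.02], dtype=np.float32)    # slightly outside [-range, range]
i_exact = np.rint(f / scale).astype(np.int64)    # 130, -130: outside int8
clamped = np.clip(i_exact, -128, 127).astype(np.int8)   # truncate == true
wrapped = i_exact.astype(np.int8)                        # truncate == false: wraps
print(clamped, wrapped)   # [ 127 -128] vs [-126  126]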
+ cuda_mat_compress_sign(dimGrid, dimBlock, mat.Data(), mat.Dim(), + data_, stride_); + } else { + cuda_mat_compress(dimGrid, dimBlock, mat.Data(), mat.Dim(), + data_, stride_, float(1.0 / scale_), + truncate_); + } + CU_SAFE_CALL(cudaGetLastError()); + + CuDevice::Instantiate().AccuProfile(__func__, tim); + } +#endif +} + +template +void CuCompressedMatrix::CopyToMat(CuMatrixBase *mat) const { +#if HAVE_CUDA == 1 + KALDI_ASSERT(CuDevice::Instantiate().Enabled()); + KALDI_ASSERT(mat->NumRows() == num_rows_ && mat->NumCols() == num_cols_); + { + CuTimer tim; + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); + BaseFloat scale = (scale_ == 0.0 ? 1.0 : scale_); + cuda_mat_uncompress(dimGrid, dimBlock, mat->Data(), mat->Dim(), + data_, stride_, float(scale)); + } +#endif +} + + +CuCompressedMatrixBase *NewCuCompressedMatrix(CuCompressedMatrixType t, + BaseFloat range, + bool truncat) { + if (t == kCompressedMatrixUint8) { + KALDI_ASSERT(range >= 0); + return new CuCompressedMatrix(range); + } else if (t == kCompressedMatrixInt8) { + KALDI_ASSERT(range >= 0); + return new CuCompressedMatrix(range); + } else if (t == kCompressedMatrixUint16) { + KALDI_ASSERT(range > 0); + return new CuCompressedMatrix(range); + } else if (t == kCompressedMatrixInt16) { + KALDI_ASSERT(range > 0); + return new CuCompressedMatrix(range); + } else { + KALDI_ERR << "Unknown compressed-matrix type"; + return NULL; + } +} + + + +} // namespace kaldi diff --git a/src/cudamatrix/cu-compressed-matrix.h b/src/cudamatrix/cu-compressed-matrix.h new file mode 100644 index 00000000000..1ef7853b906 --- /dev/null +++ b/src/cudamatrix/cu-compressed-matrix.h @@ -0,0 +1,162 @@ +// cudamatrix/cu-compressed-matrix.h + +// Copyright 2018 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + + +#ifndef KALDI_CUDAMATRIX_CU_COMPRESSED_MATRIX_H_ +#define KALDI_CUDAMATRIX_CU_COMPRESSED_MATRIX_H_ + +#include "cudamatrix/cu-matrix.h" + +namespace kaldi { + +/** + Class CuCompressedMatrixBase is an abstract base class that allows you to + compress a matrix of type CuMatrix. When you instantiate it you + would choose the child-class type (by allocating the appropriate child-class + type via 'new'). + */ +class CuCompressedMatrixBase { + public: + + /// Sets *this to an appropriately compressed copy of 'mat', which + /// includes resizing *this. The details of how this is done will be + /// different in different child classes. + virtual void CopyFromMat(const CuMatrixBase &mat) = 0; + + /// Copies the contents of *this to 'mat', which should be + /// correctly sized beforehand. + virtual void CopyToMat(CuMatrixBase *mat) const = 0; + + + // The number of rows in *this. + virtual int32 NumRows() const = 0; + + // The number of columns in *this. 
+ virtual int32 NumCols() const = 0; + + virtual ~CuCompressedMatrixBase() { } +}; + + + +/** + Class CuCompressedMatrix, templated on an integer type (expected to be one + of: int8, uint8, int16, uint16), this provides a way to approximate a + CuMatrix in a more memory-efficient format. It's used in nnet3 to + reduce memory use for large networks. + + It is *not* a CUDA equivalent for class CompressedMatrix (of + ../matrix/compressed-matrix.h). Note: this class is only to be used when you + are using a GPU. If you didn't compile for CUDA or you are not using a GPU, + you are not supposed to create an instance of this class, and doing so will + cause a runtime error. + */ +template +class CuCompressedMatrix: public CuCompressedMatrixBase { + public: + + /// Constructor which sets 'scale_' according to + /// scale_ = range / std::numeric_limits::max(). + /// + /// range = 0 (only supported for I == int8) is a special case in which only + /// the sign of the input is retained; and when we reconstruct, the output + /// will be -1, 0 or 1. + /// + /// truncate (only relevant if range != 0) should be true if it's possible + /// that the input could exceed the allowed input range, i.e. [0, range] if I + /// is unsigned, and [-range, range] if I is signed; and it may be false if + /// you know that the input (the matrix given to CopyFromMat) will have + /// elements only in the allowed range. Setting 'truncate' to false + /// allows the compression code to avoid the bounds check. + CuCompressedMatrix(BaseFloat range, bool truncate = true); + + virtual void CopyFromMat(const CuMatrixBase &mat); + + virtual void CopyToMat(CuMatrixBase *mat) const; + + virtual MatrixIndexT NumRows() const { return num_rows_; } + + virtual MatrixIndexT NumCols() const { return num_cols_; } + + + virtual ~CuCompressedMatrix() { Destroy(); } + + private: + // If there was data in 'data_', frees it, and sets it to NULL. + void Destroy(); + + // The raw data. + I *data_; + + // scale_ affects how the raw data is interpreted as a floating point value. + // When uncompressing to a CuMatrix, we'll do: + // f = scale_ * i + // where f is the floating point value we're writing to, and i is the integer + // value. + // + // scale_ = 0 is treated specially; in this case we just take notice of the + // sign of the input, and when uncompressing we do it with a scale such + // that the output becomes -1, 0 and 1. + BaseFloat scale_; + + // 'truncate_' affects the code that compresses data to integer values. + // If the data we're compressing might possibly be outside of the representable + // range, then you should set truncate to true (this is the default in the + // constructor). This way, values larger than the minimum or maximum will + // be set to the minimum or maximum value. If truncate_ is false, it will + // just wrap around, but the compression code will be slightly faster as + // it doesn't need to check. + bool truncate_; + + MatrixIndexT num_rows_; + MatrixIndexT num_cols_; + // stride_ is currently always equal to num_cols_; it was added mainly to + // point the way to possible future extension. + MatrixIndexT stride_; +}; + + + +// This enum value is used to encode the type you want to instantiate +// a CuCompressedMatrix with. It's used in class NnetComputation +// (cast to int32) as one of the arguments of kCompressMatrix. 
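A CPU toy mirroring the usage contract documented above may help: CopyFromMat resizes the compressed object itself, while CopyToMat expects a destination the caller has already sized (this is only an analogy; the real class is GPU-only):

import numpy as np

class ToyCompressedMatrix:
    def __init__(self, range_):
        self.scale = range_ / 127.0     # int8 analogue of the scale_ member
        self.data = None
    def copy_from_mat(self, mat):       # resizes self to mat's dimensions
        self.data = np.clip(np.rint(mat / self.scale), -128, 127).astype(np.int8)
    def copy_to_mat(self, mat):         # dest must be correctly sized beforehand
        assert mat.shape == self.data.shape
        mat[...] = self.data.astype(np.float32) * self.scale

cm = ToyCompressedMatrix(1.0)
M = np.random.uniform(-1, 1, (3, 4)).astype(np.float32)
cm.copy_from_mat(M)
M2 = np.empty_like(M)
cm.copy_to_mat(M2)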
+enum CuCompressedMatrixType { + kCompressedMatrixInt8 = 1, + kCompressedMatrixUint8 = 2, + kCompressedMatrixInt16 = 3, + kCompressedMatrixUint16 = 4 +}; + +/** + This function allocates a new CuCompressedMatrix with type determined + by t, and with the 'range' and 'truncate' parameters provided to the + constructor of class CuCompressedMatrix. + + It will crash at runtime if called when CUDA is not compiled in, or not + enabled. + */ +CuCompressedMatrixBase *NewCuCompressedMatrix(CuCompressedMatrixType t, + BaseFloat range, + bool truncate = true); + + +} // namespace kaldi + +#endif diff --git a/src/cudamatrix/cu-device.cc b/src/cudamatrix/cu-device.cc index 9b0976b05ad..87e266e1889 100644 --- a/src/cudamatrix/cu-device.cc +++ b/src/cudamatrix/cu-device.cc @@ -58,6 +58,15 @@ namespace kaldi { */ static bool GetCudaContext(int32 num_gpus, std::string *debug_str) { + + // Our first attempt to get a device context is: we do cudaFree(0) and see if + // that returns no error code. If it succeeds then we have a device + // context. Apparently this is the canonical way to get a context. + if (cudaFree(0) == 0) + return true; + + // The rest of this code represents how we used to get a device context, but + // now its purpose is mainly a debugging one. std::ostringstream debug_stream; debug_stream << "num-gpus=" << num_gpus << ". "; for (int32 device = 0; device < num_gpus; device++) { @@ -220,9 +229,9 @@ void CuDevice::FinalizeActiveGpu() { } // Remember the id of active GPU active_gpu_id_ = act_gpu_id; // CuDevice::Enabled() is true from now on - // Initialize the CUBLAS + // Initialize CUBLAS. CUBLAS_SAFE_CALL(cublasCreate(&handle_)); - // Initialize the cuSPARSE + // Initialize the cuSPARSE library CUSPARSE_SAFE_CALL(cusparseCreate(&cusparse_handle_)); // Notify user which GPU is finally used diff --git a/src/cudamatrix/cu-kernels-ansi.h b/src/cudamatrix/cu-kernels-ansi.h index 7d2db9adcc9..8ab03c7e14e 100644 --- a/src/cudamatrix/cu-kernels-ansi.h +++ b/src/cudamatrix/cu-kernels-ansi.h @@ -30,6 +30,15 @@ #if HAVE_CUDA == 1 extern "C" { +// "C" version of the BaseFloat typedef-- this saves us having to write +// multiple versions of these kernels. 
+#if (KALDI_DOUBLEPRECISION != 0) +typedef double BaseFloat; +#else +typedef float BaseFloat; +#endif + + void cudaD_add_col_sum_mat(int Gr, int Bl, double* result, const double* mat, const MatrixDim d, const double alpha, const double beta); @@ -736,6 +745,42 @@ void cudaF_vec_soft_max(int Gr, int Bl, float* v, int dim); void cudaD_vec_sum(int Gr, int Bl, double* v, double* value, int dim, int inc); void cudaF_vec_sum(int Gr, int Bl, float* v, float* value, int dim, int inc); + +void cuda_compress_int16(dim3 Gr, dim3 Bl, const BaseFloat *src, + MatrixDim dim, int16_t *dest, + int dest_stride, float inv_scale, + bool bounds_check); +void cuda_compress_uint16(dim3 Gr, dim3 Bl, const BaseFloat *src, + MatrixDim dim, uint16_t *dest, + int dest_stride, float inv_scale, + bool bounds_check); +void cuda_compress_uint8(dim3 Gr, dim3 Bl, const BaseFloat *src, + MatrixDim dim, uint8_t *dest, + int dest_stride, float inv_scale, + bool bounds_check); +void cuda_compress_int8(dim3 Gr, dim3 Bl, const BaseFloat *src, + MatrixDim dim, int8_t *dest, + int dest_stride, float inv_scale, + bool bounds_check); + +void cuda_compress_uint8_sign(dim3 Gr, dim3 Bl, const BaseFloat *src, + MatrixDim dim, uint8_t *dest, int dest_stride); + +void cuda_uncompress_int16(dim3 Gr, dim3 Bl, BaseFloat *dest, + MatrixDim dim, const int16_t *src, + int src_stride, float scale); +void cuda_uncompress_uint16(dim3 Gr, dim3 Bl, BaseFloat *dest, + MatrixDim dim, const uint16_t *src, + int src_stride, float scale); +void cuda_uncompress_int8(dim3 Gr, dim3 Bl, BaseFloat *dest, + MatrixDim dim, const int8_t *src, + int src_stride, float scale); +void cuda_uncompress_uint8(dim3 Gr, dim3 Bl, BaseFloat *dest, + MatrixDim dim, const uint8_t *src, + int src_stride, float scale); + + + } // extern "C" #endif // HAVE_CUDA diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index 2f8f37224be..ae7e25b716d 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -3558,6 +3558,104 @@ static void _diff_lstm_nonlinearity(const int cell_dim, const int have_dropout_m } } + +__global__ +static void _cuda_compress_uint8_sign(const BaseFloat *src, MatrixDim dim, + unsigned char *dest, int dest_stride) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + int dest_index = i + j * dest_stride, + src_index = i + j * dim.stride; + if (i < dim.cols && j < dim.rows) { + BaseFloat f = src[src_index]; + dest[dest_index] = (f > 0.0 ? (unsigned char)1 : (unsigned char)0); + } +} + + +// The following inline templated functions are a workaround for the +// fact that (I believe) std::numeric_limits is not available in CUDA; +// they allow us to access the minimum and maximum elements of certain +// types from templated code. 
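As a cross-check on the limits hard-coded in the specializations declared just below, the same values fall out of the standard integer-type limits:

import numpy as np
for t in [np.int8, np.uint8, np.int16, np.uint16]:
    info = np.iinfo(t)
    print(t.__name__, info.min, info.max)
# int8 -128 127, uint8 0 255, int16 -32768 32767, uint16 0 65535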
+template __device__ static inline int minimum_integer_value(); +template __device__ static inline int maximum_integer_value(); + +template<> __device__ int maximum_integer_value() { return 127; } +template<> __device__ int minimum_integer_value() { return -128; } +template<> __device__ int maximum_integer_value() { return 255; } +template<> __device__ int minimum_integer_value() { return 0; } +template<> __device__ int maximum_integer_value() { return 32767; } +template<> __device__ int minimum_integer_value() { return -32768; } +template<> __device__ int maximum_integer_value() { return 65535; } +template<> __device__ int minimum_integer_value() { return 0; } + + + +template +__global__ +static void _cuda_compress_bounds_check(const BaseFloat *src, MatrixDim dim, + I *dest, int dest_stride, float inv_scale) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + int dest_index = i + j * dest_stride, + src_index = i + j * dim.stride; + const int min_value = minimum_integer_value(), + max_value = maximum_integer_value(); + int compressed_value; + int ok = (i < dim.cols && j < dim.rows); + if (ok) { + float f = src[src_index]; + // note: I'm not sure what __float2int_rn does if input is outside of + // integer range, but it doesn't matter much as in the situations where this + // type of compression would make sense, the input should be well inside the + // range of 'int', and if it fails, we've probably already catastrophically + // diverged. + int i = __float2int_rn(f * inv_scale); + if (i < min_value) compressed_value = min_value; + else if (i > max_value) compressed_value = max_value; + else compressed_value = i; + } + __syncthreads(); + if (ok) { + dest[dest_index] = compressed_value; + } +} + + +template +__global__ +static void _cuda_compress_no_bounds_check(const BaseFloat *src, MatrixDim dim, + I *dest, int dest_stride, + float inv_scale) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + int dest_index = i + j * dest_stride, + src_index = i + j * dim.stride; + if (i < dim.cols && j < dim.rows) { + float f = src[src_index]; + int i = __float2int_rn(f * inv_scale); + I s = i; + dest[dest_index] = s; + } +} + +template +__global__ +static void _cuda_uncompress(BaseFloat *dest, MatrixDim dim, + const I *src, int src_stride, + float scale) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + int src_index = i + j * src_stride, + dest_index = i + j * dim.stride; + if (i < dim.cols && j < dim.rows) { + I s = src[src_index]; + dest[dest_index] = float(s * scale); + } +} + + + /*********************************************************************** * ANSI-C wrappers of CUDA kernels */ @@ -5220,3 +5318,69 @@ void cudaF_apply_exp_special(dim3 Gr, dim3 Bl, float* out, MatrixDim out_dim, _apply_exp_special<<>>(out, out_dim, in, in_stride); } +void cuda_compress_uint8_sign(dim3 Gr, dim3 Bl, const BaseFloat *src, MatrixDim dim, + unsigned char *dest, int dest_stride) { + _cuda_compress_uint8_sign<<>>(src, dim, dest, dest_stride); +} + +void cuda_compress_int16(dim3 Gr, dim3 Bl, const BaseFloat *src, + MatrixDim dim, int16_t *dest, + int dest_stride, float inv_scale, + bool bounds_check) { + if (bounds_check) { + _cuda_compress_bounds_check<<>>(src, dim, dest, dest_stride, inv_scale); + } else { + _cuda_compress_no_bounds_check<<>>(src, dim, dest, dest_stride, inv_scale); + } +} +void cuda_compress_uint16(dim3 Gr, dim3 Bl, const BaseFloat *src, + 
MatrixDim dim, uint16_t *dest, + int dest_stride, float inv_scale, + bool bounds_check) { + if (bounds_check) { + _cuda_compress_bounds_check<<>>(src, dim, dest, dest_stride, inv_scale); + } else { + _cuda_compress_no_bounds_check<<>>(src, dim, dest, dest_stride, inv_scale); + } +} +void cuda_compress_int8(dim3 Gr, dim3 Bl, const BaseFloat *src, + MatrixDim dim, int8_t *dest, + int dest_stride, float inv_scale, + bool bounds_check) { + if (bounds_check) { + _cuda_compress_bounds_check<<>>(src, dim, dest, dest_stride, inv_scale); + } else { + _cuda_compress_no_bounds_check<<>>(src, dim, dest, dest_stride, inv_scale); + } +} +void cuda_compress_uint8(dim3 Gr, dim3 Bl, const BaseFloat *src, + MatrixDim dim, uint8_t *dest, + int dest_stride, float inv_scale, + bool bounds_check) { + if (bounds_check) { + _cuda_compress_bounds_check<<>>(src, dim, dest, dest_stride, inv_scale); + } else { + _cuda_compress_no_bounds_check<<>>(src, dim, dest, dest_stride, inv_scale); + } +} + +void cuda_uncompress_uint8(dim3 Gr, dim3 Bl, BaseFloat *dest, + MatrixDim dim, const uint8_t *src, + int src_stride, float scale) { + _cuda_uncompress<<>>(dest, dim, src, src_stride, scale); +} +void cuda_uncompress_int8(dim3 Gr, dim3 Bl, BaseFloat *dest, + MatrixDim dim, const int8_t *src, + int src_stride, float scale) { + _cuda_uncompress<<>>(dest, dim, src, src_stride, scale); +} +void cuda_uncompress_uint16(dim3 Gr, dim3 Bl, BaseFloat *dest, + MatrixDim dim, const uint16_t *src, + int src_stride, float scale) { + _cuda_uncompress<<>>(dest, dim, src, src_stride, scale); +} +void cuda_uncompress_int16(dim3 Gr, dim3 Bl, BaseFloat *dest, + MatrixDim dim, const int16_t *src, + int src_stride, float scale) { + _cuda_uncompress<<>>(dest, dim, src, src_stride, scale); +} diff --git a/src/cudamatrix/cu-kernels.h b/src/cudamatrix/cu-kernels.h index 27ccf760557..3518e0c71ed 100644 --- a/src/cudamatrix/cu-kernels.h +++ b/src/cudamatrix/cu-kernels.h @@ -1463,6 +1463,73 @@ inline void cuda_vec_sum(int Gr, int Bl, float* v, float* value, int dim, cudaF_vec_sum(Gr, Bl, v, value, dim, inc); } +// Compresses the matrix in 'src' to 'dest', retaining only zero-one +// information (1 if the value is >0, 0 otherwise) +inline void cuda_mat_compress_sign(dim3 Gr, dim3 Bl, const BaseFloat *src, + MatrixDim dim, uint8 *dest, + int dest_stride) { + cuda_compress_uint8_sign(Gr, Bl, src, dim, dest, dest_stride); +} +// this template handles the other types that are not instantiated yet, +// to avoid compilation errors. 
+template +inline void cuda_mat_compress_sign(dim3 Gr, dim3 Bl, const BaseFloat *src, + MatrixDim dim, I *dest, + int dest_stride) { + KALDI_ERR << "Not implemented for this type."; +} + +inline void cuda_mat_compress(dim3 Gr, dim3 Bl, const BaseFloat *src, + MatrixDim dim, int16_t *dest, + int dest_stride, float inv_scale, + bool bounds_check) { + cuda_compress_int16(Gr, Bl, src, dim, dest, dest_stride, + inv_scale, bounds_check); +} +inline void cuda_mat_compress(dim3 Gr, dim3 Bl, const BaseFloat *src, + MatrixDim dim, uint16_t *dest, + int dest_stride, float inv_scale, + bool bounds_check) { + cuda_compress_uint16(Gr, Bl, src, dim, dest, dest_stride, + inv_scale, bounds_check); +} +inline void cuda_mat_compress(dim3 Gr, dim3 Bl, const BaseFloat *src, + MatrixDim dim, uint8_t *dest, + int dest_stride, float inv_scale, + bool bounds_check) { + cuda_compress_uint8(Gr, Bl, src, dim, dest, dest_stride, + inv_scale, bounds_check); +} +inline void cuda_mat_compress(dim3 Gr, dim3 Bl, const BaseFloat *src, + MatrixDim dim, int8_t *dest, + int dest_stride, float inv_scale, + bool bounds_check) { + cuda_compress_int8(Gr, Bl, src, dim, dest, dest_stride, + inv_scale, bounds_check); +} + +inline void cuda_mat_uncompress(dim3 Gr, dim3 Bl, BaseFloat *dest, + MatrixDim dim, const int8_t *src, + int src_stride, float scale) { + cuda_uncompress_int8(Gr, Bl, dest, dim, src, src_stride, scale); +} +inline void cuda_mat_uncompress(dim3 Gr, dim3 Bl, BaseFloat *dest, + MatrixDim dim, const uint8_t *src, + int src_stride, float scale) { + cuda_uncompress_uint8(Gr, Bl, dest, dim, src, src_stride, scale); +} +inline void cuda_mat_uncompress(dim3 Gr, dim3 Bl, BaseFloat *dest, + MatrixDim dim, const int16_t *src, + int src_stride, float scale) { + cuda_uncompress_int16(Gr, Bl, dest, dim, src, src_stride, scale); +} +inline void cuda_mat_uncompress(dim3 Gr, dim3 Bl, BaseFloat *dest, + MatrixDim dim, const uint16_t *src, + int src_stride, float scale) { + cuda_uncompress_uint16(Gr, Bl, dest, dim, src, src_stride, scale); +} + + } // namespace kaldi #endif // HAVE_CUDA diff --git a/src/cudamatrix/cu-matrix-lib.h b/src/cudamatrix/cu-matrix-lib.h index ef21a2945f1..1da7efafc97 100644 --- a/src/cudamatrix/cu-matrix-lib.h +++ b/src/cudamatrix/cu-matrix-lib.h @@ -29,5 +29,6 @@ #include "cudamatrix/cu-sparse-matrix.h" #include "cudamatrix/cu-block-matrix.h" #include "cudamatrix/cu-rand.h" +#include "cudamatrix/cu-compressed-matrix.h" #endif diff --git a/src/doc/get_version_info.sh b/src/doc/get_version_info.sh index 422798905f5..c11fb7f805e 100755 --- a/src/doc/get_version_info.sh +++ b/src/doc/get_version_info.sh @@ -1,6 +1,6 @@ #!/bin/bash -# search for VERSIONS below to see how to change this when +# search for VERSION below to see how to change this when # Kaldi's version number increases. # Note: this script assumes that it's part of a git repository where @@ -42,7 +42,8 @@ fi # Note: when you add new tuples here you'll also want to add ndew # \htmlinclude directives in versions.dox. -for tuple in "5.0 5.0 c160a9883" "5.1 5.1 2145519961" "5.2 5.2 393ef73caa93" "5.3 master 131cdd4cb544"; do +for tuple in "5.0 5.0 c160a9883" "5.1 5.1 2145519961" "5.2 5.2 393ef73caa93" "5.3 5.3 131cdd4cb544" \ + "5.4 master be969d7baf04"; do major_minor_number=$(echo $tuple | awk '{print $1}') # e.g. 5.0 branch=$(echo $tuple | awk '{print $2}') # e.g. 
   first_commit=$(echo $tuple | awk '{print $3}')
diff --git a/src/doc/lattices.dox b/src/doc/lattices.dox
index 714d9de6f2e..0b222ec5f1a 100644
--- a/src/doc/lattices.dox
+++ b/src/doc/lattices.dox
@@ -264,8 +264,10 @@ has the same effect as calling that the normal OpenFst RemoveEps() and Determini
 
 \section lattices_generation Lattice generation
 
-Currently, the only decoder that generates lattices is the class
-LatticeSimpleDecoder, defined in decoder/lattice-simple-decoder.h, and invoked by gmm-latgen-simple.cc.
+Command-line decoding programs that have 'latgen' in their names generate lattices.
+Currently most of these use LatticeFasterDecoder. For purposes of exposition we will
+focus instead on LatticeSimpleDecoder, whose operation is simpler.
+This is defined in decoder/lattice-simple-decoder.h, and invoked by gmm-latgen-simple.cc.
 As the name suggests, LatticeSimpleDecoder is a lattice-generating decoder that is
 modified from SimpleDecoder. SimpleDecoder is a straightforwardly implemented Viterbi
 beam search algorithm with only a single tunable parameter: the pruning beam (see \ref decoders_simple). LatticeSimpleDecoder has
diff --git a/src/doc/versions.dox b/src/doc/versions.dox
index 9461ef1e873..d12b8621ccd 100644
--- a/src/doc/versions.dox
+++ b/src/doc/versions.dox
@@ -116,7 +116,7 @@
 
   \subsection versions_versions_53 Version 5.3
 
-  Version 5.3 is the current master branch. Major changes that were made between the end of 5.2.x
+  Major changes that were made between the end of 5.2.x
   and the start of the 5.3 branch include:
     - Create a nnet3-based setup for RNN language models (i.e. recurrent and
       neural net based language models)
@@ -127,4 +127,24 @@
 
   \htmlinclude 5.3.html
 
+  \subsection versions_versions_54 Version 5.4
+
+
+  Version 5.4 is the current master branch. The main changes that were made between
+  the end of 5.3.x and the start of the 5.4 branch include:
+    - Some code changes in the nnet3 codebase, for speed and memory efficiency.
+    - Various simplifications and code reorganizations in the nnet3 code.
+    - Support for a new kind of factorized TDNN which gives substantially better
+      results than our old TDNN recipe, and is even better than our old TDNN+LSTM
+      recipe. A good example of this is in egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1n.sh.
+      Some nnet3 code changes were needed for this as well (mostly: support for constraining
+      a matrix to have orthonormal rows).
+
+  Below are patches corresponding to minor version numbers 5.4.x.
+
+  \htmlinclude 5.4.html
+
+
+
+
 */
diff --git a/src/nnet3/Makefile b/src/nnet3/Makefile
index 8ddba56b0e0..df0fb2d4502 100644
--- a/src/nnet3/Makefile
+++ b/src/nnet3/Makefile
@@ -15,7 +15,7 @@ TESTFILES = natural-gradient-online-test nnet-graph-test \
            nnet-common-test convolution-test attention-test
 
 OBJFILES = nnet-common.o nnet-compile.o nnet-component-itf.o \
-           nnet-simple-component.o \
+           nnet-simple-component.o nnet-normalize-component.o \
            nnet-general-component.o nnet-parse.o natural-gradient-online.o \
            nnet-descriptor.o nnet-optimize.o nnet-computation.o \
            nnet-computation-graph.o nnet-graph.o am-nnet-simple.o \
diff --git a/src/nnet3/natural-gradient-online-test.cc b/src/nnet3/natural-gradient-online-test.cc
index 88a14616f9d..445cc43f868 100644
--- a/src/nnet3/natural-gradient-online-test.cc
+++ b/src/nnet3/natural-gradient-online-test.cc
@@ -271,7 +271,7 @@ void UnitTestPreconditionDirectionsOnline() {
   if (Rand() % 3 == 0) zero = true;
   //else if (Rand() % 2 == 0) one = true;
-  CuVector<BaseFloat> row_prod1(N), row_prod2(N);
+  CuVector<BaseFloat> row_prod1(N);
   BaseFloat gamma1, gamma2;
   BaseFloat big_eig_factor = RandInt(1, 20);
   big_eig_factor = big_eig_factor * big_eig_factor;
@@ -301,14 +301,13 @@
   preconditioner1.PreconditionDirections(&Mcopy1, &row_prod1, &gamma1);
-  preconditioner2.PreconditionDirections(&Mcopy2, &row_prod2, &gamma2);
+  preconditioner2.PreconditionDirections(&Mcopy2, &gamma2);
 
   BaseFloat trace1 = TraceMatMat(M, M, kTrans),
       trace2 = TraceMatMat(Mcopy1, Mcopy1, kTrans);
   AssertEqual(trace1, trace2 * gamma2 * gamma2, 1.0e-02);
   AssertEqual(Mcopy1, Mcopy2);
-  AssertEqual(row_prod1, row_prod2, 1.0e-02);
   AssertEqual(gamma1, gamma2, 1.0e-02);
 
   // make sure positive definite
diff --git a/src/nnet3/natural-gradient-online.cc b/src/nnet3/natural-gradient-online.cc
index 5ef413b9f38..b5740053f46 100644
--- a/src/nnet3/natural-gradient-online.cc
+++ b/src/nnet3/natural-gradient-online.cc
@@ -18,6 +18,7 @@
 // limitations under the License.
 
 #include "nnet3/natural-gradient-online.h"
+#include "nnet3/nnet-parse.h"
 
 namespace kaldi {
 namespace nnet3 {
@@ -26,8 +27,8 @@ namespace nnet3 {
 OnlineNaturalGradient::OnlineNaturalGradient():
     rank_(40), update_period_(1), num_samples_history_(2000.0),
     num_minibatches_history_(0.0), alpha_(4.0),
-    epsilon_(1.0e-10), delta_(5.0e-04), frozen_(false), t_(-1),
-    num_updates_skipped_(0), self_debug_(false) { }
+    epsilon_(1.0e-10), delta_(5.0e-04), frozen_(false), t_(0),
+    self_debug_(false) { }
 
 /**
@@ -123,6 +124,7 @@ void OnlineNaturalGradient::Init(const CuMatrixBase<BaseFloat> &R0) {
   // for locking reasons it's better to use a different object.
   OnlineNaturalGradient this_copy(*this);
   this_copy.InitDefault(D);
+  this_copy.t_ = 1;  // Prevent recursion to Init() again.
  CuMatrix<BaseFloat> R0_copy(R0.NumRows(), R0.NumCols(), kUndefined);
  // 'num_iters' is number of iterations with the same data from a pseudorandom
@@ -146,52 +148,53 @@
   for (int32 i = 0; i < num_init_iters; i++) {
     BaseFloat scale;
     R0_copy.CopyFromMat(R0);
-    this_copy.PreconditionDirections(&R0_copy, NULL, &scale);
+    this_copy.PreconditionDirections(&R0_copy, &scale);
   }
   rank_ = this_copy.rank_;
   W_t_.Swap(&this_copy.W_t_);
   d_t_.Swap(&this_copy.d_t_);
   rho_t_ = this_copy.rho_t_;
-  t_ = 0;
 }
 
 void OnlineNaturalGradient::PreconditionDirections(
     CuMatrixBase<BaseFloat> *X_t,
-    CuVectorBase<BaseFloat> *row_prod,
     BaseFloat *scale) {
   if (X_t->NumCols() == 1) {
     // If the dimension of the space equals one then our natural gradient update
     // with rescaling becomes a no-op, but the code wouldn't naturally handle it
     // because rank would be zero. Support this as a special case.
-    if (row_prod)
-      row_prod->AddDiagMat2(1.0, *X_t, kNoTrans, 0.0);
-    *scale = 1.0;
-    return;
-  }
-
-  if (row_prod == NULL) {
-    CuVector<BaseFloat> row_prod_tmp(X_t->NumRows());
-    PreconditionDirections(X_t, &row_prod_tmp, scale);
+    if (scale)
+      *scale = 1.0;
     return;
   }
 
-  read_write_mutex_.lock();
-  if (t_ == -1) // not initialized
+  if (t_ == 0) // not initialized
     Init(*X_t);
 
-  // Now t_ >= 0.
-  // We create local copies of the class variables... this is intended for
-  // multi-threaded safety so we can't read them in an inconsistent state,
-  // but we don't really waste anything here (a copy of W_t is needed anyway,
-  // if we're to update it).
-  int32 t = t_, R = W_t_.NumRows(), D = W_t_.NumCols();
+  int32 R = W_t_.NumRows(), D = W_t_.NumCols();
   // space for W_t, J_t, K_t, L_t.
   CuMatrix<BaseFloat> WJKL_t(2 * R, D + R);
   WJKL_t.Range(0, R, 0, D).CopyFromMat(W_t_);
   BaseFloat rho_t(rho_t_);
   Vector<BaseFloat> d_t(d_t_);
-  read_write_mutex_.unlock();
-  PreconditionDirectionsInternal(t, rho_t, d_t, &WJKL_t, X_t, row_prod, scale);
+
+  bool updating = Updating();
+
+  BaseFloat initial_product;
+  initial_product = TraceMatMat(*X_t, *X_t, kTrans);
+
+  PreconditionDirectionsInternal(rho_t, initial_product,
+                                 updating, d_t, &WJKL_t, X_t);
+
+  if (scale) {
+    if (initial_product <= 0.0) {
+      *scale = 1.0;
+    } else {
+      BaseFloat final_product = TraceMatMat(*X_t, *X_t, kTrans);
+      *scale = sqrt(initial_product / final_product);
+    }
+  }
+  t_ += 1;
 }
 
 void OnlineNaturalGradient::ReorthogonalizeXt1(
@@ -318,13 +321,12 @@ void OnlineNaturalGradient::SelfTest() const {
 }
 
 void OnlineNaturalGradient::PreconditionDirectionsInternal(
-    const int32 t,
     const BaseFloat rho_t,
+    const BaseFloat tr_X_Xt,
+    bool updating,
     const Vector<BaseFloat> &d_t,
     CuMatrixBase<BaseFloat> *WJKL_t,
-    CuMatrixBase<BaseFloat> *X_t,
-    CuVectorBase<BaseFloat> *row_prod,
-    BaseFloat *scale) {
+    CuMatrixBase<BaseFloat> *X_t) {
   int32 N = X_t->NumRows(),  // Minibatch size.
      D = X_t->NumCols(),  // Dimensions of vectors we're preconditioning
      R = rank_;  // Rank of correction to unit matrix.
@@ -343,57 +345,11 @@
   H_t.AddMatMat(1.0, *X_t, kNoTrans, W_t, kTrans, 0.0);  // H_t = X_t W_t^T
 
-  bool locked = update_mutex_.try_lock();
-  if (locked) {
-    // We'll release the lock if we don't plan to update the parameters.
-
-    // Explanation of the conditions below:
-    // if (frozen_): because we don't do the update if the user called Freeze().
-    // I forget why the (t_ > t) is here; probably some race condition encountered
-    // a long time ago. Not important; nnet3 doesn't use multiple threads anyway.
-    // The condition:
-    // (num_updates_skipped_ < update_period_ - 1 && t_ >= num_initial_updates)
-    // means we can update if either we're in the first 10 updates (e.g. first
-    // 10 minibatches), or if we've skipped 'update_period_ - 1' batches of data
-    // without updating the parameters (this allows us to update only, say,
-    // every 4 times, for speed, after updating the first 10 times).
-
-    // Just hard-code it here that we do 10 initial updates before skipping any.
-    const int num_initial_updates = 10;
-    if (frozen_ || t_ > t || (num_updates_skipped_ < update_period_ - 1 &&
-                              t_ >= num_initial_updates)) {
-      update_mutex_.unlock();
-      // We got the lock but we were already beaten to it by another thread, or
-      // we don't want to update yet due to update_period_ > 1 (this saves
-      // compute), so release the lock.
-      locked = false;
-    }
-  }
-
-  if (!locked) {
-    // We're not updating the parameters, either because another thread is
-    // working on updating them, or because another thread already did so from
-    // the same or later starting point (making our update stale), or because
-    // update_period_ > 1. We just apply the preconditioning and return.
-
-    // note: we don't bother with any locks before checking frozen_ or incrementing
-    // num_updates_skipped_ below, because the worst that could happen is that,
-    // on very rare occasions, we could skip one or two more updates than we
-    // intended.
-    if (!frozen_)
-      num_updates_skipped_++;
-
-    BaseFloat tr_Xt_XtT = TraceMatMat(*X_t, *X_t, kTrans);
+  if (!updating) {
+    // We're not updating the estimate of the Fisher matrix; we just apply the
+    // preconditioning and return.
 
     // X_hat_t = X_t - H_t W_t
     X_t->AddMatMat(-1.0, H_t, kNoTrans, W_t, kNoTrans, 1.0);
-    // each element i of row_prod will be inner product of row i of X_hat_t with
-    // itself.
-    row_prod->AddDiagMat2(1.0, *X_t, kNoTrans, 0.0);
-    BaseFloat tr_Xhat_XhatT = row_prod->Sum();
-    KALDI_ASSERT(tr_Xhat_XhatT == tr_Xhat_XhatT);  // Check for NaN.
-    BaseFloat gamma_t = (tr_Xhat_XhatT == 0.0 ? 1.0 :
-                         sqrt(tr_Xt_XtT / tr_Xhat_XhatT));
-    *scale = gamma_t;
     return;
   }
   J_t.AddMatMat(1.0, H_t, kTrans, *X_t, kNoTrans, 0.0);  // J_t = H_t^T X_t
@@ -457,31 +413,14 @@
   if (nf > 0 && self_debug_) {
     KALDI_WARN << "Floored " << nf << " elements of C_t.";
   }
-  BaseFloat tr_Xt_XtT_check;
-  if (self_debug_)
-    tr_Xt_XtT_check = TraceMatMat(*X_t, *X_t, kTrans);
 
   X_t->AddMatMat(-1.0, H_t, kNoTrans, W_t, kNoTrans, 1.0);  // X_hat_t = X_t - H_t W_t
-  // set *row_prod to inner products of each row of X_hat_t with itself.
-  row_prod->AddDiagMat2(1.0, *X_t, kNoTrans, 0.0);
-
-  BaseFloat tr_Xhat_XhatT = row_prod->Sum();
-  // tr(X_t X_t^T) = tr(X_hat_t X_hat_t^T) - tr(L_t E_t) + 2 tr(L_t)
-  double tr_Xt_XtT = tr_Xhat_XhatT;
-  for (int32 i = 0; i < R; i++)
-    tr_Xt_XtT += L_t_cpu(i, i) * (2.0 - e_t(i));
-  if (self_debug_) {
-    KALDI_ASSERT(ApproxEqual(tr_Xt_XtT, tr_Xt_XtT_check));
-  }
-  BaseFloat gamma_t = (tr_Xhat_XhatT == 0.0 ? 1.0 :
-                       sqrt(tr_Xt_XtT / tr_Xhat_XhatT));
-  *scale = gamma_t;
 
   Vector<BaseFloat> sqrt_c_t(c_t);
   sqrt_c_t.ApplyPow(0.5);
 
   // \rho_{t+1} = 1/(D - R) (\eta/N tr(X_t X_t^T) + (1-\eta)(D \rho_t + tr(D_t)) - tr(C_t^{0.5})).
-  BaseFloat rho_t1 = 1.0 / (D - R) * (eta / N * tr_Xt_XtT +
+  BaseFloat rho_t1 = 1.0 / (D - R) * (eta / N * tr_X_Xt +
                                       (1-eta)*(D * rho_t + d_t.Sum()) - sqrt_c_t.Sum());
 
   // D_{t+1} = C_t^{0.5} - \rho_{t+1} I
@@ -507,22 +446,25 @@
                         &L_t);
   }
 
-  // Commit the new parameters.
-  read_write_mutex_.lock();
-  KALDI_ASSERT(t_ == t);  // we already ensured this.
-  t_ = t + 1;
-  num_updates_skipped_ = 0;
   W_t_.Swap(&W_t1);
   d_t_.CopyFromVec(d_t1);
   rho_t_ = rho_t1;
 
   if (self_debug_)
     SelfTest();
+}
+
+bool OnlineNaturalGradient::Updating() const {
+  // Just hard-code it here that we do 10 initial updates before skipping any.
+  // This must be > 'num_init_iters = 3' from Init().
+  const int num_initial_updates = 10;
 
-  read_write_mutex_.unlock();
-  update_mutex_.unlock();
+  return (!frozen_ &&
+          (t_ <= num_initial_updates ||
+           (t_ - num_initial_updates) % update_period_ == 0));
 }
 
+
 BaseFloat OnlineNaturalGradient::Eta(int32 N) const {
   if (num_minibatches_history_ > 0.0) {
     KALDI_ASSERT(num_minibatches_history_ > 1.0);
@@ -636,12 +578,10 @@ OnlineNaturalGradient::OnlineNaturalGradient(const OnlineNaturalGradient &other):
     num_samples_history_(other.num_samples_history_),
     num_minibatches_history_(other.num_minibatches_history_),
     alpha_(other.alpha_), epsilon_(other.epsilon_), delta_(other.delta_),
-    frozen_(other.frozen_),
-    t_(other.t_), num_updates_skipped_(other.num_updates_skipped_),
+    frozen_(other.frozen_), t_(other.t_),
     self_debug_(other.self_debug_), W_t_(other.W_t_),
-    rho_t_(other.rho_t_), d_t_(other.d_t_) {
-  // use default constructor for the mutexes.
-}
+    rho_t_(other.rho_t_), d_t_(other.d_t_) { }
+
 
 OnlineNaturalGradient& OnlineNaturalGradient::operator = (
     const OnlineNaturalGradient &other) {
diff --git a/src/nnet3/natural-gradient-online.h b/src/nnet3/natural-gradient-online.h
index 67c25eb0dbc..0b05948977e 100644
--- a/src/nnet3/natural-gradient-online.h
+++ b/src/nnet3/natural-gradient-online.h
@@ -437,33 +437,52 @@ class OnlineNaturalGradient {
   // see comment where 'frozen_' is declared.
   inline void Freeze(bool frozen) { frozen_ = frozen; }
 
-  // The "R" pointer is both the input (R in the comment) and the output (P in
-  // the comment; equal to the preconditioned directions before scaling by
-  // gamma). If the pointer "row_prod" is supplied, it's set to the inner product
-  // of each row of the preconditioned directions P, at output, with itself.
-  // You would need to apply "scale" to R and "scale * scale" to row_prod, to
-  // get the preconditioned directions; we don't do this ourselves, in order to
-  // save CUDA calls.
+  /**
+     This call implements the main functionality of this class.
+
+     @param [in,out] R  The "R" pointer is both the input (R in the
+            comment, X in the paper), and the output (P in the comment,
+            X with a hat on it in the paper). Each row of R is viewed
+            as a vector in some space, where we're estimating a smoothed
+            Fisher matrix and then multiplying by the inverse of that
+            smoothed Fisher matrix.
+
+     @param [out] scale  If non-NULL, a scaling factor is written to here,
+            and the output 'R' should be multiplied by this factor by
+            the user (we don't do it internally, to save an operation).
+            The factor is chosen so that the vector 2-norm of R is the
+            same after the natural gradient as it was before. (The pointer
+            being NULL or non-NULL doesn't affect the magnitude of R;
+            in any case the user will probably want to do this rescaling,
+            the question being whether they want to do so manually or
+            not.)
+  */
   void PreconditionDirections(CuMatrixBase<BaseFloat> *R,
-                              CuVectorBase<BaseFloat> *row_prod,
                               BaseFloat *scale);
+
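// [Editor's sketch, not part of the patch: the new calling convention in a
// typical SGD-style update. 'params', 'deriv' and 'learning_rate' are
// hypothetical names; only PreconditionDirections() and the returned 'scale'
// come from the class above.]
void ExampleNaturalGradientUpdate(OnlineNaturalGradient *preconditioner,
                                  CuMatrixBase<BaseFloat> *params,
                                  CuMatrixBase<BaseFloat> *deriv,
                                  BaseFloat learning_rate) {
  BaseFloat scale;
  preconditioner->PreconditionDirections(deriv, &scale);
  // The caller folds 'scale' into its own update, which saves one CUDA
  // operation inside the call; compare UpdateNaturalGradient() in
  // nnet-convolutional-component.cc at the end of this patch.
  params->AddMat(learning_rate * scale, *deriv);
}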
+  // Copy constructor.
   explicit OnlineNaturalGradient(const OnlineNaturalGradient &other);
 
   // Assignment operator
   OnlineNaturalGradient &operator = (const OnlineNaturalGradient &other);
 
  private:
 
-  // This does the work of PreconditionDirections (the top-level
-  // function handles some multithreading issues and then calls this function).
+  // This is an internal function called from PreconditionDirections().
   // Note: WJKL_t (dimension 2*R by D + R) is [ W_t L_t; J_t K_t ].
-  void PreconditionDirectionsInternal(const int32 t,
-                                      const BaseFloat rho_t,
+  void PreconditionDirectionsInternal(const BaseFloat rho_t,
+                                      const BaseFloat tr_X_Xt,
+                                      bool updating,
                                       const Vector<BaseFloat> &d_t,
                                       CuMatrixBase<BaseFloat> *WJKL_t,
-                                      CuMatrixBase<BaseFloat> *X_t,
-                                      CuVectorBase<BaseFloat> *row_prod,
-                                      BaseFloat *scale);
+                                      CuMatrixBase<BaseFloat> *X_t);
+
+
+  // Works out from t_ and various class variables whether we will update
+  // the parameters on this iteration (returns true if so).
+  bool Updating() const;
 
   void ComputeEt(const VectorBase<BaseFloat> &d_t, BaseFloat beta_t,
@@ -512,10 +531,14 @@
   // or columns.
   static void InitOrthonormalSpecial(CuMatrixBase<BaseFloat> *R);
 
-  // Returns the learning rate eta as the function of the number of samples
-  // (actually, N is the number of vectors we're preconditioning, which due to
-  // context is not always exactly the same as the number of samples). The
-  // value returned depends on num_samples_history_.
+  // Returns the value eta (with 0 < eta < 1) which reflects how fast we update
+  // the estimate of the Fisher matrix (larger == faster). This is a function
+  // rather than a constant because we set this indirectly, via
+  // num_samples_history_ or num_minibatches_history_. The argument N is the
+  // number of vectors we're preconditioning, i.e. the number of rows in the
+  // argument R to PreconditionDirections() (in the common case, some multiple
+  // of the minibatch size).
   BaseFloat Eta(int32 N) const;
 
   // called if self_debug_ = true, makes sure the members satisfy certain
@@ -577,29 +600,16 @@
   // the *second* time we see the same data (to avoid biasing the update).
   bool frozen_;
 
-  // t is a counter that measures how many updates we've done.
+  // t is a counter that measures how many times the user has previously called
+  // PreconditionDirections(); it's 0 if that has never been called.
   int32 t_;
 
-  // This keeps track of how many minibatches we've skipped updating the parameters,
-  // since the most recent update; it's used in enforcing "update_period_", which
-  // is a mechanism to avoid spending too much time updating the subspace (which can
-  // be wasteful).
-  int32 num_updates_skipped_;
-
   // If true, activates certain checks.
   bool self_debug_;
 
   CuMatrix<BaseFloat> W_t_;
   BaseFloat rho_t_;
   Vector<BaseFloat> d_t_;
-
-
-  // Used to prevent parameters being read or written in an inconsistent state.
-  std::mutex read_write_mutex_;
-
-  // This mutex is used to control which thread gets to update the
-  // parameters, in multi-threaded code.
-  std::mutex update_mutex_;
 };
 
 } // namespace nnet3
diff --git a/src/nnet3/nnet-analyze.cc b/src/nnet3/nnet-analyze.cc
index 9c48744fadc..ec1d3fa0f2e 100644
--- a/src/nnet3/nnet-analyze.cc
+++ b/src/nnet3/nnet-analyze.cc
@@ -238,6 +238,23 @@ std::string ComputationVariables::DescribeVariable(int32 variable) const {
   return os.str();
 }
 
+NnetComputation::SubMatrixInfo ComputationVariables::VariableInfo(
+    int32 variable) const {
+  KALDI_ASSERT(variable >= 0 && variable < num_variables_);
+  int32 matrix_index = variable_to_matrix_[variable],
+      offset = variable - matrix_to_variable_index_[matrix_index],
+      num_column_variables = column_split_points_[matrix_index].size() - 1,
+      column_variable = offset % num_column_variables,
+      row_variable = offset / num_column_variables;
+  int32 row_offset = row_split_points_[matrix_index][row_variable],
+      num_rows = row_split_points_[matrix_index][row_variable+1] - row_offset,
+      col_offset = column_split_points_[matrix_index][column_variable],
+      num_cols = column_split_points_[matrix_index][column_variable+1] -
+      col_offset;
+  return NnetComputation::SubMatrixInfo(matrix_index, row_offset, num_rows,
+                                        col_offset, num_cols);
+}
+
 
 /// given a vector of pairs from computation.indexes_multi_indexes
 /// containing pairs (submatrix-index, row-index), this function outputs
@@ -367,6 +384,14 @@ void ComputeCommandAttributes(
       vars.RecordAccessForSubmatrix(c.arg2, kReadAccess, &attr);
       break;
     }
+    case kCompressMatrix: {
+      vars.RecordAccessForSubmatrix(c.arg1, kReadWriteAccess, &attr);
+      break;
+    }
+    case kDecompressMatrix: {
+      vars.RecordAccessForSubmatrix(c.arg1, kWriteAccess, &attr);
+      break;
+    }
     case kAcceptInput: {
       vars.RecordAccessForSubmatrix(c.arg1, kWriteAccess, &attr);
       break;
    }
@@ -555,6 +580,7 @@ void ComputationChecker::Check() {
   CheckComputationIndexes();
   a_.Init(nnet_, computation_);
   CheckComputationMatrixAccesses();
+  CheckComputationCompression();
   CheckComputationUndefined();
   CheckComputationDebugInfo();
   if (config_.check_rewrite)
@@ -608,16 +634,36 @@ void ComputationChecker::CheckComputationRewrite() const {
   Checks for the situation where a variable is read before being written.
 */
 void ComputationChecker::CheckComputationUndefined() const {
+  // the variable 'min_proportion' needs to be <= the min_proportion_ value in
+  // class MatrixExtender, otherwise this code could spuriously reject a
+  // computation.
+  BaseFloat min_proportion = 0.8;
+
   int32 num_variables = a_.variable_accesses.size();
   for (int32 v = 0; v < num_variables; v++) {
     const std::vector<Access> &accesses = a_.variable_accesses[v];
     if (accesses.empty()) {
       if (config_.check_unused_variables) {
+        NnetComputation::SubMatrixInfo info = a_.variables.VariableInfo(v);
+        const NnetComputation::MatrixInfo &matrix_info =
+            computation_.matrices[info.matrix_index];
+        // Before we throw an error, we want to check that it isn't a case that
+        // can be produced by the ExtendMatrices() optimization, that is
+        // actually allowed. This is a case when a variable is inside the last
+        // few rows of a matrix, but not all columns of those last rows.
+        if (info.row_offset >= min_proportion * matrix_info.num_rows &&
+            !(info.col_offset == 0 && info.num_cols == matrix_info.num_cols)) {
+          continue;
+        }
         KALDI_ERR << "Variable " << v << " == "
                   << a_.variables.DescribeVariable(v) << " is never used.";
       }
     } else {
-      if (accesses[0].access_type != kWriteAccess)
+      // It's OK if a part of a matrix that is undefined gets compressed;
+      // likely that part won't be referred to when we uncompress.
+      if (accesses[0].access_type != kWriteAccess &&
+          !(computation_.commands[accesses[0].command_index].command_type ==
+            kCompressMatrix))
         KALDI_ERR << "Variable " << v << " == "
                   << a_.variables.DescribeVariable(v)
                   << " is read before it is written to";
@@ -647,9 +693,10 @@ void ComputationChecker::CheckComputationMatrixAccesses() const {
       KALDI_ERR << "Matrix m" << matrix_index << " is accessed before "
           "it is initialized";
     }
-    if (accesses.accesses.size() == 1) {
+    if (accesses.accesses.size() == 1 && config_.check_unused_variables) {
       int32 first_access_command = accesses.accesses[0].command_index;
       if (computation_.commands[first_access_command].command_type == kSetConst) {
         KALDI_ERR << "Matrix m" << matrix_index << " is only set to a constant "
                   << "value, but then never accessed.";
       }
     }
@@ -678,6 +725,64 @@
   }
 }
 
+void ComputationChecker::CheckComputationCompression() const {
+  int32 num_matrices = a_.matrix_accesses.size();
+
+  // 'middle_command' will be the index of the command that separates
+  // the forward and backward passes.
+  int32 middle_command = -1;
+  for (size_t i = 0; i < computation_.commands.size(); i++) {
+    if (computation_.commands[i].command_type == kNoOperationMarker) {
+      middle_command = static_cast<int32>(i);
+      break;
+    }
+  }
+  for (int32 matrix_index = 1; matrix_index < num_matrices; matrix_index++) {
+    const MatrixAccesses &accesses = a_.matrix_accesses[matrix_index];
+    int32 num_accesses = accesses.accesses.size();
+    for (int32 a = 0; a < num_accesses; a++) {
+      const Access &access = accesses.accesses[a];
+      int32 command_index = access.command_index;
+      const NnetComputation::Command &command =
+          computation_.commands[command_index];
+      if (command.command_type == kDecompressMatrix) {
+        // check that the previous access to this matrix was a compression
+        // command.
+        KALDI_ASSERT(
+            a > 0 && computation_.commands[
+                accesses.accesses[a-1].command_index].command_type ==
+            kCompressMatrix);
+      }
+      if (command.command_type == kCompressMatrix) {
+        // check that the next access to this matrix is an uncompression
+        // command.
+        int32 next_command_index = accesses.accesses[a+1].command_index;
+        KALDI_ASSERT(computation_.commands[next_command_index].command_type ==
+                     kDecompressMatrix &&
+                     command_index < middle_command &&
+                     next_command_index > middle_command);
+        if (command.alpha == 0.0) {
+          // alpha == 0.0 means we're only retaining the sign; we should
+          // only do this if this is the output of a ReLU.
+          // make sure there are only 2 commands after this: the uncompress
+          // command, and a relu backprop command. (Any deallocation
+          // command doesn't show up in the list of 'accesses').
+          KALDI_ASSERT(a > 0 && command.arg2 == kCompressedMatrixUint8 &&
+                       num_accesses == a + 3);
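// [Editor's illustration, not from the patch: the command pattern the
// assertions above enforce for sign-only (alpha == 0.0) compression; the
// command indices are made up. ReLU backprop only needs the sign of the
// output to gate the derivatives, which is why one uint8 per element
// suffices.]
//
//   c11: m3 = Propagate(relu, ...)             (forward pass)
//   c12: CompressMatrix(m3, 0, uint8, false)   (keep only sign(m3) > 0)
//   ...
//   c20: [kNoOperationMarker]                  (forward/backward boundary)
//   ...
//   c30: DecompressMatrix(m3)
//   c31: Backprop(relu, ..., m3, ...)          (the only later use of m3)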
+          // make sure the next access to that matrix, apart from the
+          // uncompression command, is a ReLU backprop.
+          int32 next_command_index = accesses.accesses[a+2].command_index;
+          const NnetComputation::Command &next_command =
+              computation_.commands[next_command_index];
+          KALDI_ASSERT(next_command.command_type == kBackprop &&
+                       nnet_.GetComponent(next_command.arg1)->Type() ==
+                       "RectifiedLinearComponent");
+        }
+      }
+    }
+  }
+}
+
 
 /**
    This very basic check just makes sure that all indexes in the commands are
    within range, that dimensions agree with the request, that row/column dimensions
@@ -930,6 +1035,26 @@ void ComputationChecker::CheckComputationIndexes() const {
      }
      break;
    }
+    case kCompressMatrix: {
+      if (c.arg1 < 1 || c.arg1 >= num_submatrices ||
+          !computation_.IsWholeMatrix(c.arg1))
+        KALDI_ERR << "submatrix index out of range or invalid";
+      if (c.arg2 < static_cast<int32>(kCompressedMatrixInt8) ||
+          c.arg2 > static_cast<int32>(kCompressedMatrixUint16))
+        KALDI_ERR << "Invalid compressed-matrix type.";
+      if (c.arg3 != 0 && c.arg3 != 1)
+        KALDI_ERR << "Invalid 'truncate' option for compressing matrix.";
+      if (c.alpha < 0.0 || c.alpha > 1000.0 ||
+          (c.alpha == 0.0 && c.arg2 != kCompressedMatrixUint8))
+        KALDI_ERR << "Invalid alpha in kCompressMatrix command.";
+      break;
+    }
+    case kDecompressMatrix: {
+      if (c.arg1 < 1 || c.arg1 >= num_submatrices ||
+          !computation_.IsWholeMatrix(c.arg1))
+        KALDI_ERR << "submatrix index out of range or invalid";
+      break;
+    }
    case kAcceptInput: case kProvideOutput: {
      if (c.arg1 < 1 || c.arg1 >= num_submatrices ||
          !computation_.IsWholeMatrix(c.arg1))
@@ -1081,6 +1206,23 @@ int32 ComputationAnalysis::FirstNontrivialAccess(int32 s) const {
 }
 
 
+int32 ComputationAnalysis::FirstAccess(int32 s) const {
+  KALDI_ASSERT(static_cast<size_t>(s) < computation_.submatrices.size() && s > 0);
+  int32 ans = computation_.commands.size();
+  std::vector<int32> variable_indexes;
+  analyzer_.variables.AppendVariablesForSubmatrix(s, &variable_indexes);
+  std::vector<int32>::const_iterator iter = variable_indexes.begin(),
+      end = variable_indexes.end();
+  for (; iter != end; ++iter) {
+    int32 v = *iter;
+    const std::vector<Access> &accesses = analyzer_.variable_accesses[v];
+    if (!accesses.empty())
+      ans = std::min(ans, accesses[0].command_index);
+  }
+  return ans;
+}
+
+
 int32 ComputationAnalysis::FirstNontrivialMatrixAccess(int32 m) const {
   KALDI_ASSERT(static_cast<size_t>(m) < computation_.matrices.size() && m > 0);
   int32 ans = computation_.commands.size();
@@ -1301,13 +1443,20 @@ int64 GetMaxMemoryUse(const NnetComputation &computation) {
       num_submatrices = computation.submatrices.size();
   for (int32 command_index = 0; command_index < num_commands; ++command_index) {
     const NnetComputation::Command &c = computation.commands[command_index];
-    int64 this_num_bytes = -100000000;
+    int64 this_num_bytes = -100000000,
+        this_compressed_num_bytes = -10000000;
 
     if (c.arg1 >= 0 && c.arg1 < num_submatrices) {
       // if arg1 could plausibly be a sub-matrix index...
       const NnetComputation::SubMatrixInfo &submat_info =
          computation.submatrices[c.arg1];
      this_num_bytes = static_cast<int64>(sizeof(BaseFloat)) *
          submat_info.num_rows * submat_info.num_cols;
+
+      this_compressed_num_bytes =
+          ((c.arg2 == static_cast<int32>(kCompressedMatrixInt8) ||
+            c.arg2 == static_cast<int32>(kCompressedMatrixUint8)) ?
+           1 : 2) * static_cast<int64>(submat_info.num_rows) *
+          submat_info.num_cols;
    }
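// [Editor's arithmetic example of the accounting above, with made-up sizes:
// for a 1000 x 512 matrix of 4-byte BaseFloats, this_num_bytes is
// 4 * 1000 * 512 = 2048000, while uint8 compression gives
// this_compressed_num_bytes = 1 * 1000 * 512 = 512000. So a kCompressMatrix
// command (handled in the switch below) changes cur_memory_use by
// -1536000 bytes, and the matching kDecompressMatrix adds the same
// amount back.]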
 
     switch (c.command_type) {
       case kAllocMatrix:
@@ -1317,6 +1466,12 @@
         cur_memory_use += this_num_bytes;
         break;
       case kDeallocMatrix:
         cur_memory_use -= this_num_bytes;
         break;
+      case kCompressMatrix:
+        cur_memory_use += this_compressed_num_bytes - this_num_bytes;
+        break;
+      case kDecompressMatrix:
+        cur_memory_use += this_num_bytes - this_compressed_num_bytes;
+        break;
       default:
         break;
     }
diff --git a/src/nnet3/nnet-analyze.h b/src/nnet3/nnet-analyze.h
index 259a4546d53..77466039756 100644
--- a/src/nnet3/nnet-analyze.h
+++ b/src/nnet3/nnet-analyze.h
@@ -160,6 +160,7 @@ class ComputationVariables {
   // zero indexing): something like "m1" or "m1(0:99,:)" or "m1(0:19,10:49)"
   std::string DescribeVariable(int32 variable) const;
 
+  NnetComputation::SubMatrixInfo VariableInfo(int32 variable) const;
  private:
   // sets up split_points_, matrix_to_variable_index_, and num_variables_.
   // called from constructor.
@@ -321,6 +322,13 @@ class ComputationAnalysis {
   /// s must be >0 (i.e. not the empty submatrix).
   int32 FirstNontrivialAccess(int32 s) const;
 
+  /// Returns the first command (read or write) that accesses any part of 's',
+  /// including possibly zeroing it. [note: kAllocMatrix, kSwapMatrix and
+  /// kDeallocMatrix do not count as read or write operations]. If there is no
+  /// such command, it returns num_commands. s must be >0 (i.e. not the empty
+  /// submatrix).
+  int32 FirstAccess(int32 s) const;
+
   /// Returns the last non-deallocation command that accesses any part of
   /// submatrix 's'; if there is no such command it returns -1.
   /// s must be >0 (i.e. not the empty submatrix).
@@ -385,7 +393,7 @@ struct CheckComputationOptions {
   // legitimately fail after optimization. see code for details.
   bool check_rewrite;
 
   // If 'check_unused_variables' is true, it checks for unused variables
-  // (e.g. unused partsof matrices). We only set it false for online
+  // (e.g. unused parts of matrices). We only set it false for online
   // computations, where there can be instances where a part of a matrix is
   // apparently never accessed (until we consider that the matrix is swapped
   // with another).
@@ -407,15 +415,17 @@ class ComputationChecker {
                      const NnetComputation &computation);
   void Check();  // call this only once.
  private:
-  // various dimension consistency checks and checks on properties.
+  // Various dimension consistency checks and checks on properties.
   void CheckComputationIndexes() const;
-  // checks for a situation where an undefined variable is read.
+  // Checks for a situation where an undefined variable is read.
   void CheckComputationUndefined() const;
-  // checks that all writes are done before reads. details with implementation.
+  // Checks that all writes are done before reads. details with implementation.
   void CheckComputationRewrite() const;
-  // check matrix accesses make sense.
+  // Check matrix accesses make sense.
   void CheckComputationMatrixAccesses() const;
-  // check debug_info has the correct size, if used.
+  // Some checks related to the kCompressMatrix and kDecompressMatrix commands.
+  void CheckComputationCompression() const;
+  // Check debug_info has the correct size, if used.
   void CheckComputationDebugInfo() const;
 
   const CheckComputationOptions &config_;
diff --git a/src/nnet3/nnet-chain-training.cc b/src/nnet3/nnet-chain-training.cc
index 3e6d8599382..2080c60077b 100644
--- a/src/nnet3/nnet-chain-training.cc
+++ b/src/nnet3/nnet-chain-training.cc
@@ -95,8 +95,11 @@ void NnetChainTrainer::Train(const NnetChainExample &chain_eg) {
 
 void NnetChainTrainer::TrainInternal(const NnetChainExample &eg,
                                      const NnetComputation &computation) {
   const NnetTrainerOptions &nnet_config = opts_.nnet_config;
+  // note: because we give the 1st arg (nnet_) as a pointer to the
+  // constructor of 'computer', it will use that copy of the nnet to
+  // store stats. This is mainly important for batch-norm.
   NnetComputer computer(nnet_config.compute_config, computation,
-                        *nnet_, delta_nnet_);
+                        nnet_, delta_nnet_);
   // give the inputs to the computer object.
   computer.AcceptInputs(*nnet_, eg.inputs);
   computer.Run();
@@ -120,6 +123,10 @@ void NnetChainTrainer::TrainInternal(const NnetChainExample &eg,
   // happens when we use the model with batchnorm test-mode set).
   ScaleBatchnormStats(nnet_config.batchnorm_stats_scale, nnet_);
 
+  // The following will only do something if we have a LinearComponent
+  // or AffineComponent with orthonormal-constraint set to a nonzero value.
+  ConstrainOrthonormal(nnet_);
+
   // Scale delta_nnet
   if (success)
     ScaleNnet(nnet_config.momentum, delta_nnet_);
@@ -131,8 +138,11 @@ void NnetChainTrainer::TrainInternalBackstitch(const NnetChainExample &eg,
                                                const NnetComputation &computation,
                                                bool is_backstitch_step1) {
   const NnetTrainerOptions &nnet_config = opts_.nnet_config;
+  // note: because we give the 1st arg (nnet_) as a pointer to the
+  // constructor of 'computer', it will use that copy of the nnet to
+  // store stats. This is mainly important for batch-norm.
   NnetComputer computer(nnet_config.compute_config, computation,
-                        *nnet_, delta_nnet_);
+                        nnet_, delta_nnet_);
   // give the inputs to the computer object.
   computer.AcceptInputs(*nnet_, eg.inputs);
   computer.Run();
@@ -168,6 +178,21 @@ void NnetChainTrainer::TrainInternalBackstitch(const NnetChainExample &eg,
       nnet_config.max_param_change, max_change_scale, scale_adding, nnet_,
       &num_max_change_per_component_applied_, &num_max_change_global_applied_);
 
+  if (is_backstitch_step1) {
+    // The following will only do something if we have a LinearComponent or
+    // AffineComponent with orthonormal-constraint set to a nonzero value. We
+    // choose to do this only on the 1st backstitch step, for efficiency.
+    ConstrainOrthonormal(nnet_);
+  }
+
+  if (!is_backstitch_step1) {
+    // Scale down the batchnorm stats (keeps them fresh... this affects what
+    // happens when we use the model with batchnorm test-mode set). Do this
+    // after backstitch step 2 so that the stats are scaled down before we start
+    // the next minibatch.
+    ScaleBatchnormStats(nnet_config.batchnorm_stats_scale, nnet_);
+  }
+
   ScaleNnet(0.0, delta_nnet_);
 }
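// [Editor's aside, not part of the patch: a rough picture of what
// ConstrainOrthonormal() (declared in nnet-utils.h) is described as doing.
// One standard way to pull a parameter matrix M toward satisfying
// M M^T = I is a gradient step on f(M) = ||M M^T - I||_F^2, whose gradient
// is 4 (M M^T - I) M. The sketch below is illustrative only; the real
// routine differs in details such as step-size selection and support for a
// scaled constraint M M^T = alpha^2 I.]
void ApproxOrthonormalStep(CuMatrixBase<BaseFloat> *M, BaseFloat nu) {
  int32 rows = M->NumRows();
  CuMatrix<BaseFloat> P(rows, rows);
  P.AddMatMat(1.0, *M, kNoTrans, *M, kTrans, 0.0);  // P = M M^T
  P.AddToDiag(-1.0);                                // P = M M^T - I
  CuMatrix<BaseFloat> M_old(*M);
  // M <- M - 4 * nu * (M M^T - I) M
  M->AddMatMat(-4.0 * nu, P, kNoTrans, M_old, kNoTrans, 1.0);
}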
@@ -196,9 +221,6 @@ void NnetChainTrainer::ProcessOutputs(bool is_backstitch_step2,
     bool use_xent = (opts_.chain_config.xent_regularize != 0.0);
     std::string xent_name = sup.name + "-xent";  // typically "output-xent".
     CuMatrix<BaseFloat> xent_deriv;
-    if (use_xent)
-      xent_deriv.Resize(nnet_output.NumRows(), nnet_output.NumCols(),
-                        kUndefined);
 
     BaseFloat tot_objf, tot_l2_term, tot_weight;
diff --git a/src/nnet3/nnet-common.cc b/src/nnet3/nnet-common.cc
index 75350d3d8f6..31ff9819dfa 100644
--- a/src/nnet3/nnet-common.cc
+++ b/src/nnet3/nnet-common.cc
@@ -440,6 +440,11 @@ void PrintIndexes(std::ostream &os,
     os << "[ ]";
     return;
   }
+  // If the string is longer than 'max_string_length' characters, it will
+  // be summarized with '...' in the middle.
+  size_t max_string_length = 200;
+  std::ostringstream os_temp;
+
   // range_starts will be the starts of ranges (with consecutive t values and
   // the same n value and zero x values) that we compactly print. we'll append
   // "end" to range_starts for convenience.
@@ -457,23 +462,32 @@
   }
   range_starts.push_back(cur_start);
   range_starts.push_back(end);
-  os << "[";
+  os_temp << "[";
   int32 num_ranges = range_starts.size() - 1;
   for (int32 r = 0; r < num_ranges; r++) {
     int32 range_start = range_starts[r],
         range_end = range_starts[r+1];
     KALDI_ASSERT(range_end > range_start);
-    os << "(" << indexes[range_start].n << ",";
+    os_temp << "(" << indexes[range_start].n << ",";
     if (range_end == range_start + 1)
-      os << indexes[range_start].t;
+      os_temp << indexes[range_start].t;
     else
-      os << indexes[range_start].t << ":" << indexes[range_end - 1].t;
+      os_temp << indexes[range_start].t << ":" << indexes[range_end - 1].t;
     if (indexes[range_start].x != 0)
-      os << "," << indexes[range_start].x;
-    os << ")";
+      os_temp << "," << indexes[range_start].x;
+    os_temp << ")";
     if (r + 1 < num_ranges)
-      os << ", ";
+      os_temp << ", ";
+  }
+  os_temp << "]";
+
+  std::string str = os_temp.str();
+  if (str.size() <= max_string_length) {
+    os << str;
+  } else {
+    size_t len = str.size();
+    os << str.substr(0, max_string_length / 2) << " ... "
+       << str.substr(len - max_string_length / 2);
   }
-  os << "]";
 }
 
 void PrintCindexes(std::ostream &ostream,
diff --git a/src/nnet3/nnet-compile-looped.cc b/src/nnet3/nnet-compile-looped.cc
index fa8a2322e5a..1a5ceabab0e 100644
--- a/src/nnet3/nnet-compile-looped.cc
+++ b/src/nnet3/nnet-compile-looped.cc
@@ -357,7 +357,6 @@ void CreateLoopedComputationRequestSimple(const Nnet &nnet,
                                           ComputationRequest *request1,
                                           ComputationRequest *request2,
                                           ComputationRequest *request3) {
-  bool has_ivector = (nnet.InputDim("ivector") > 0);
   int32 left_context, right_context;
   ComputeSimpleNnetContext(nnet, &left_context, &right_context);
 
diff --git a/src/nnet3/nnet-component-itf.cc b/src/nnet3/nnet-component-itf.cc
index 82010fea58d..ce4bbd0940a 100644
--- a/src/nnet3/nnet-component-itf.cc
+++ b/src/nnet3/nnet-component-itf.cc
@@ -23,6 +23,7 @@
 #include
 #include "nnet3/nnet-component-itf.h"
 #include "nnet3/nnet-simple-component.h"
+#include "nnet3/nnet-normalize-component.h"
 #include "nnet3/nnet-general-component.h"
 #include "nnet3/nnet-convolutional-component.h"
 #include "nnet3/nnet-attention-component.h"
@@ -331,24 +332,23 @@ std::string UpdatableComponent::Info() const {
 
 void NonlinearComponent::StoreStatsInternal(
     const CuMatrixBase<BaseFloat> &out_value,
     const CuMatrixBase<BaseFloat> *deriv) {
-  KALDI_ASSERT(out_value.NumCols() == InputDim());
+  KALDI_ASSERT(out_value.NumCols() == dim_);
 
   // Check we have the correct dimensions.
-  if (value_sum_.Dim() != InputDim() ||
-      (deriv != NULL && deriv_sum_.Dim() != InputDim())) {
-    std::lock_guard<std::mutex> lock(mutex_);
-    if (value_sum_.Dim() != InputDim()) {
-      value_sum_.Resize(InputDim());
+  if (value_sum_.Dim() != dim_ ||
+      (deriv != NULL && deriv_sum_.Dim() != dim_)) {
+    if (value_sum_.Dim() != dim_) {
+      value_sum_.Resize(dim_);
       count_ = 0.0;
     }
-    if (deriv != NULL && deriv_sum_.Dim() != InputDim()) {
-      deriv_sum_.Resize(InputDim());
+    if (deriv != NULL && deriv_sum_.Dim() != dim_) {
+      deriv_sum_.Resize(dim_);
       count_ = 0.0;
       value_sum_.SetZero();
     }
   }
   count_ += out_value.NumRows();
-  CuVector<BaseFloat> temp(InputDim());
+  CuVector<BaseFloat> temp(dim_);
   temp.AddRowSumMat(1.0, out_value, 0.0);
   value_sum_.AddVec(1.0, temp);
   if (deriv != NULL) {
@@ -357,22 +357,39 @@
   }
 }
 
+void NonlinearComponent::StoreBackpropStats(
+    const CuMatrixBase<BaseFloat> &out_deriv) {
+  // to save a little time, we skip storing these stats for about one
+  // minibatch in four.
+  if (RandInt(0, 3) == 0)
+    return;
+
+  KALDI_ASSERT(out_deriv.NumCols() == dim_);
+
+  // Check we have the correct dimensions.
+  if (oderiv_sumsq_.Dim() != dim_) {
+    oderiv_sumsq_.Resize(dim_);
+    oderiv_count_ = 0.0;
+  }
+  CuVector<BaseFloat> temp(dim_);
+  temp.AddDiagMat2(1.0, out_deriv, kTrans, 0.0);
+  oderiv_sumsq_.AddVec(1.0, temp);
+  oderiv_count_ += out_deriv.NumRows();
+}
+
+
 void NonlinearComponent::ZeroStats() {
   value_sum_.SetZero();
   deriv_sum_.SetZero();
+  oderiv_sumsq_.SetZero();
   count_ = 0.0;
+  oderiv_count_ = 0.0;
   num_dims_self_repaired_ = 0.0;
   num_dims_processed_ = 0.0;
 }
 
 std::string NonlinearComponent::Info() const {
   std::stringstream stream;
-  if (InputDim() == OutputDim()) {
-    stream << Type() << ", dim=" << InputDim();
-  } else {
-    stream << Type() << ", input-dim=" << InputDim()
-           << ", output-dim=" << OutputDim();
-  }
+  stream << Type() << ", dim=" << dim_;
   if (block_dim_ != dim_)
     stream << ", block-dim=" << block_dim_;
   if (self_repair_lower_threshold_ != BaseFloat(kUnsetThreshold))
@@ -392,19 +409,30 @@
     value_avg.Scale(1.0 / count_);
     stream << ", value-avg=" << SummarizeVector(value_avg);
     if (deriv_sum_.Dim() == dim_) {
-      Vector<double> deriv_avg_dbl(deriv_sum_);
-      Vector<BaseFloat> deriv_avg(deriv_avg_dbl);
+      Vector<double> deriv_avg(deriv_sum_);
       deriv_avg.Scale(1.0 / count_);
       stream << ", deriv-avg=" << SummarizeVector(deriv_avg);
     }
   }
+  if (oderiv_count_ > 0 && oderiv_sumsq_.Dim() == dim_) {
+    Vector<double> oderiv_rms(oderiv_sumsq_);
+    oderiv_rms.Scale(1.0 / oderiv_count_);
+    // The ApplyFloor() is so that the statement after it does not fail even if we
+    // had subtracted models (e.g. in full_progress.*.log).
+    oderiv_rms.ApplyFloor(0.0);
+    oderiv_rms.ApplyPow(0.5);
+    stream << ", oderiv-rms=" << SummarizeVector(oderiv_rms)
+           << ", oderiv-count=" << oderiv_count_;
+  }
   return stream.str();
 }
 
 void NonlinearComponent::Scale(BaseFloat scale) {
   value_sum_.Scale(scale);
   deriv_sum_.Scale(scale);
+  oderiv_sumsq_.Scale(scale);
   count_ *= scale;
+  oderiv_count_ *= scale;
   num_dims_self_repaired_ *= scale;
   num_dims_processed_ *= scale;
 }
@@ -417,11 +445,16 @@ void NonlinearComponent::Add(BaseFloat alpha, const Component &other_in) {
     value_sum_.Resize(other->value_sum_.Dim());
   if (deriv_sum_.Dim() == 0 && other->deriv_sum_.Dim() != 0)
     deriv_sum_.Resize(other->deriv_sum_.Dim());
+  if (oderiv_sumsq_.Dim() == 0 && other->oderiv_sumsq_.Dim() != 0)
+    oderiv_sumsq_.Resize(other->oderiv_sumsq_.Dim());
   if (other->value_sum_.Dim() != 0)
     value_sum_.AddVec(alpha, other->value_sum_);
   if (other->deriv_sum_.Dim() != 0)
     deriv_sum_.AddVec(alpha, other->deriv_sum_);
+  if (other->oderiv_sumsq_.Dim() != 0)
+    oderiv_sumsq_.AddVec(alpha, other->oderiv_sumsq_);
   count_ += alpha * other->count_;
+  oderiv_count_ += alpha * other->oderiv_count_;
   num_dims_self_repaired_ += alpha * other->num_dims_self_repaired_;
   num_dims_processed_ += alpha * other->num_dims_processed_;
 }
@@ -444,11 +477,27 @@ void NonlinearComponent::Read(std::istream &is, bool binary) {
   deriv_sum_.Read(is, binary);
   ExpectToken(is, binary, "<Count>");
   ReadBasicType(is, binary, &count_);
+  if (PeekToken(is, binary) == 'O') {
+    ExpectToken(is, binary, "<OderivRms>");
+    oderiv_sumsq_.Read(is, binary);
+    oderiv_sumsq_.ApplyPow(2.0);
+    ExpectToken(is, binary, "<OderivCount>");
+    ReadBasicType(is, binary, &oderiv_count_);
+  } else {
+    oderiv_count_ = 0.0;
+    oderiv_sumsq_.Resize(0);
+  }
   value_sum_.Scale(count_);
   deriv_sum_.Scale(count_);
+  oderiv_sumsq_.Scale(oderiv_count_);
 
   std::string token;
   ReadToken(is, binary, &token);
+  if (token[0] != '<') {
+    // this should happen only rarely, in case we couldn't push back the
+    // '<' to the stream in PeekToken().
+    token = '<' + token;
+  }
   if (token == "<NumDimsSelfRepaired>") {
     ReadBasicType(is, binary, &num_dims_self_repaired_);
     ReadToken(is, binary, &token);
@@ -492,14 +541,29 @@ void NonlinearComponent::Write(std::ostream &os, bool binary) const {
   Vector<double> temp(value_sum_);
   if (count_ != 0.0) temp.Scale(1.0 / count_);
   temp.Write(os, binary);
-  WriteToken(os, binary, "<DerivAvg>");
-  temp.Resize(deriv_sum_.Dim(), kUndefined);
+  WriteToken(os, binary, "<DerivAvg>");
+  temp.Resize(deriv_sum_.Dim());
   temp.CopyFromVec(deriv_sum_);
   if (count_ != 0.0) temp.Scale(1.0 / count_);
   temp.Write(os, binary);
 
+  WriteToken(os, binary, "<Count>");
   WriteBasicType(os, binary, count_);
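// [Editor's worked example of the round trip implemented above; the numbers
// are made up. Suppose dim_ == 1, oderiv_sumsq_ == [9.0], oderiv_count_ == 4.
// Write() stores the RMS value sqrt(9.0 / 4) = 1.5 under the oderiv-rms token
// (hence the token name), plus the count 4. Read() then reads 1.5, squares it
// via ApplyPow(2.0) to get 2.25, and multiplies by the count
// (oderiv_sumsq_.Scale(oderiv_count_)) to recover 2.25 * 4 == 9.0.]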
+
+  WriteToken(os, binary, "<OderivRms>");
+  temp.Resize(oderiv_sumsq_.Dim());
+  temp.CopyFromVec(oderiv_sumsq_);
+  if (oderiv_count_ != 0.0) temp.Scale(1.0 / oderiv_count_);
+  // The ApplyFloor() is so that the statement after it does not fail even if we
+  // had subtracted models (e.g. in full_progress.*.log).
+  temp.ApplyFloor(0.0);
+  temp.ApplyPow(0.5);
+  temp.Write(os, binary);
+
+  WriteToken(os, binary, "<OderivCount>");
+  WriteBasicType(os, binary, oderiv_count_);
+
   WriteToken(os, binary, "<NumDimsSelfRepaired>");
   WriteBasicType(os, binary, num_dims_self_repaired_);
   WriteToken(os, binary, "<NumDimsProcessed>");
@@ -520,7 +584,7 @@
 }
 
 NonlinearComponent::NonlinearComponent():
-    dim_(-1), block_dim_(-1), count_(0.0),
+    dim_(-1), block_dim_(-1), count_(0.0), oderiv_count_(0.0),
     num_dims_self_repaired_(0.0), num_dims_processed_(0.0),
     self_repair_lower_threshold_(kUnsetThreshold),
     self_repair_upper_threshold_(kUnsetThreshold),
@@ -529,7 +593,8 @@ NonlinearComponent::NonlinearComponent():
 NonlinearComponent::NonlinearComponent(const NonlinearComponent &other):
     dim_(other.dim_), block_dim_(other.block_dim_),
     value_sum_(other.value_sum_), deriv_sum_(other.deriv_sum_),
-    count_(other.count_),
+    count_(other.count_), oderiv_sumsq_(other.oderiv_sumsq_),
+    oderiv_count_(other.oderiv_count_),
     num_dims_self_repaired_(other.num_dims_self_repaired_),
     num_dims_processed_(other.num_dims_processed_),
     self_repair_lower_threshold_(other.self_repair_lower_threshold_),
diff --git a/src/nnet3/nnet-component-itf.h b/src/nnet3/nnet-component-itf.h
index 62e09cee80f..c34d550d681 100644
--- a/src/nnet3/nnet-component-itf.h
+++ b/src/nnet3/nnet-component-itf.h
@@ -351,20 +351,23 @@ class Component {
   /// although most components will have much more info.
   virtual std::string Info() const;
 
-  /// This virtual function when called by
-  //  -- an UpdatableComponent scales the parameters
+  /// This virtual function when called on
+  /// -- an UpdatableComponent scales the parameters
   ///    by "scale" when called by an UpdatableComponent.
-  //  -- a Nonlinear component (or another component that
-  ///    stores stats, like BatchNormComponent-- it relates
+  /// -- a Nonlinear component (or another component that
+  ///    stores stats, like BatchNormComponent)-- it relates
   ///    to scaling activation stats, not parameters.
+  /// Otherwise it will normally do nothing.
   virtual void Scale(BaseFloat scale) {};
 
   /// This virtual function when called by
   /// -- an UpdatableComponent adds the parameters of
   ///    another updatable component, times some constant, to the current
   ///    parameters.
-  /// -- a NonlinearComponent it relates to adding stats
-  /// Otherwise it should do nothing.
+  /// -- a NonlinearComponent (or another component that stores
+  ///    stats, like BatchNormComponent)-- it relates to adding
+  ///    stats.
+  /// Otherwise it will normally do nothing.
   virtual void Add(BaseFloat alpha, const Component &other) {};
 
   /// This virtual function only needs to be overwritten by Components that
@@ -587,7 +590,7 @@ class UpdatableComponent: public Component {
      block-dim   Defaults to dim, but may be any nonzero divisor of dim. It
                  affects the self-repair, which will be done while treating the
-                 input/output as repeating blocks of size 'block-dim' (e.g. blocks of filtes). It allows
+                 input/output as repeating blocks of size 'block-dim' (e.g. blocks of filters). It allows
                  us to do self-repair on the filter level in CNNs. Currently this
                  only makes a difference for RectifiedLinearComponent.
   */
@@ -640,6 +643,10 @@ class NonlinearComponent: public Component {
   void StoreStatsInternal(const CuMatrixBase<BaseFloat> &out_value,
                           const CuMatrixBase<BaseFloat> *deriv = NULL);
 
+  // This function may be called from child class members during backprop. It
+  // stores the 'oderiv_sumsq_' stats.
+  void StoreBackpropStats(const CuMatrixBase<BaseFloat> &out_deriv);
+
   const NonlinearComponent &operator = (const NonlinearComponent &other); // Disallow.
@@ -655,8 +662,16 @@
   CuVector<double> deriv_sum_; // stats of the derivative of the nonlinearity
                                // (only applicable to element-by-element
                                // nonlinearities, not Softmax.
+  // Count corresponding to the stats in 'value_sum_' and 'deriv_sum_'
   double count_;
 
+  CuVector<double> oderiv_sumsq_;  // Sum-square of the derivative of the
+                                   // objective function, that we're propagating
+                                   // back. Accumulated during the backprop;
+                                   // used for diagnostics.
+  // Count corresponding to the stats in 'oderiv_sumsq_'.
+  double oderiv_count_;
+
   // some stats for self-repairing nonlinearities.
   double num_dims_self_repaired_;
   double num_dims_processed_;
@@ -665,9 +680,6 @@
   BaseFloat self_repair_lower_threshold_;
   BaseFloat self_repair_upper_threshold_;
   BaseFloat self_repair_scale_;
-
-  // The mutex is used in UpdateStats, only for resizing vectors.
-  std::mutex mutex_;
 };
 
 } // namespace nnet3
diff --git a/src/nnet3/nnet-computation.cc b/src/nnet3/nnet-computation.cc
index 77facbdba79..a9a21bb3f24 100644
--- a/src/nnet3/nnet-computation.cc
+++ b/src/nnet3/nnet-computation.cc
@@ -282,6 +282,10 @@ void NnetComputation::Command::Read(std::istream &is, bool binary) {
     command_type = kAddToRowsMulti;
   } else if (command_type_str == "kAddRowRanges") {
     command_type = kAddRowRanges;
+  } else if (command_type_str == "kCompressMatrix") {
+    command_type = kCompressMatrix;
+  } else if (command_type_str == "kDecompressMatrix") {
+    command_type = kDecompressMatrix;
   } else if (command_type_str == "kAcceptInput") {
     command_type = kAcceptInput;
   } else if (command_type_str == "kProvideOutput") {
     command_type = kProvideOutput;
@@ -375,6 +379,12 @@ void NnetComputation::Command::Write(std::ostream &os, bool binary) const {
     case kAddRowRanges:
       os << "kAddRowRanges\n";
       break;
+    case kCompressMatrix:
+      os << "kCompressMatrix\n";
+      break;
+    case kDecompressMatrix:
+      os << "kDecompressMatrix\n";
+      break;
     case kAcceptInput:
       os << "kAcceptInput\n";
       break;
@@ -500,13 +510,17 @@ static void GetIndexesMultiStrings(
 
 
 // writes to "os" the statement for this command.
-static void PrintCommand(std::ostream &os,
+static void PrintCommand(std::ostream &os_out,
                          const Nnet &nnet,
                          const NnetComputation &computation,
                          int32 command_index,
                         const std::vector<std::string> &submatrix_strings,
                         const std::vector<std::string> &indexes_strings,
                         const std::vector<std::string> &indexes_multi_strings) {
+  // If the string is longer than 'max_string_length' characters, it will
+  // be summarized with '...' in the middle.
+  size_t max_string_length = 200;
+  std::ostringstream os;
   KALDI_ASSERT(command_index < computation.commands.size());
   os << "c" << command_index << ": ";
   const NnetComputation::Command &c = computation.commands[command_index];
@@ -611,6 +625,25 @@
       os << "])\n";
       break;
    }
+    case kCompressMatrix: {
+      BaseFloat range = c.alpha;
+      std::string truncate = (c.arg3 != 0 ? "true" : "false");
+      std::string compressed_matrix_type;
+      if (c.arg2 == kCompressedMatrixInt8) { compressed_matrix_type = "int8"; }
+      else if (c.arg2 == kCompressedMatrixUint8) { compressed_matrix_type = "uint8"; }
+      else if (c.arg2 == kCompressedMatrixInt16) { compressed_matrix_type = "int16"; }
+      else {
+        KALDI_ASSERT(c.arg2 == kCompressedMatrixUint16);
+        compressed_matrix_type = "uint16";
+      }
+      os << "CompressMatrix(" << submatrix_strings[c.arg1] << ", "
+         << range << ", " << compressed_matrix_type << ", "
+         << truncate << ")\n";
+      break;
+    }
+    case kDecompressMatrix:
+      os << "DecompressMatrix(" << submatrix_strings[c.arg1] << ")\n";
+      break;
    case kAcceptInput:
      os << submatrix_strings[c.arg1] << " = user input [for node: '"
         << nnet.GetNodeName(c.arg2) << "']\n";
      break;
@@ -637,6 +670,14 @@
     default:
       KALDI_ERR << "Un-handled command type.";
   }
+  std::string str = os.str();
+  if (str.size() <= max_string_length) {
+    os_out << str;
+  } else {
+    size_t len = str.size();
+    os_out << str.substr(0, max_string_length / 2) << " ... "
+           << str.substr(len - max_string_length / 2);
+  }
 }
 
@@ -689,7 +730,7 @@ void NnetComputation::Print(std::ostream &os, const Nnet &nnet) const {
 }
 
 void NnetComputation::Read(std::istream &is, bool binary) {
-  int32 version = 4,  // must be in sync with 'version' in Write.
+  int32 version = 5,  // must be in sync with 'version' in Write.
      version_in = 1;  // defaults to 1 if no version specified.
 
   ExpectToken(is, binary, "<NnetComputation>");
@@ -823,7 +864,7 @@
 }
 
 void NnetComputation::Write(std::ostream &os, bool binary) const {
-  int32 version = 4;  // Must be in sync with version in Read.
+  int32 version = 5;  // Must be in sync with version in Read.
  WriteToken(os, binary, "<NnetComputation>");
  WriteToken(os, binary, "<Version>");
  WriteBasicType(os, binary, version);
diff --git a/src/nnet3/nnet-computation.h b/src/nnet3/nnet-computation.h
index 4b1386a1f01..d056a71498c 100644
--- a/src/nnet3/nnet-computation.h
+++ b/src/nnet3/nnet-computation.h
@@ -232,6 +232,17 @@ struct ComputationRequest {
        indexes_ranges[arg3]. We use the "alpha" as if AddRowRanges()
        accepted that argument, even though it doesn't (we fake it using other
        calls, if alpha != 1.0).
+   - kCompressMatrix: Compresses the matrix which should be referred to
+      by submatrix-index arg1. arg2 is a number that determines the
+      compression type (it's converted from the enum
+      CuCompressedMatrixType; 1=int8, 2=uint8, 3=int16, 4=uint16), and alpha
+      determines the 'range' parameter (c.f. NewCuCompressedMatrix()). arg3
+      will be converted to the 'truncate' argument to the class
+      CuCompressedMatrix; it should be false (0) if you know that the input is
+      limited to the allowed range, and true (1) if the input may exceed that
+      range (see docs for CuCompressedMatrix).
+   - kDecompressMatrix: Decompresses the matrix which is referred to
+      by submatrix-index arg1 (it should previously have been compressed).
    - kAcceptInput: accepts a matrix of input from the user, which may be
      either features, or derivatives w.r.t. the output.
      arg1 is the submatrix index of a whole matrix that the input goes to,
      and arg2 is the index of the network
@@ -263,7 +274,8 @@ enum CommandType {
  kPropagate, kBackprop, kBackpropNoModelUpdate, kMatrixCopy, kMatrixAdd,
  kCopyRows, kAddRows, kCopyRowsMulti, kCopyToRowsMulti,
  kAddRowsMulti, kAddToRowsMulti,
-  kAddRowRanges, kAcceptInput, kProvideOutput,
+  kAddRowRanges, kCompressMatrix, kDecompressMatrix,
+  kAcceptInput, kProvideOutput,
  kNoOperation, kNoOperationPermanent, kNoOperationMarker, kNoOperationLabel,
  kGotoLabel };
diff --git a/src/nnet3/nnet-compute.cc b/src/nnet3/nnet-compute.cc
index 87fa62c6112..19eecdda72b 100644
--- a/src/nnet3/nnet-compute.cc
+++ b/src/nnet3/nnet-compute.cc
@@ -30,22 +30,37 @@ NnetComputer::NnetComputer(const NnetComputeOptions &options,
                            const Nnet &nnet,
                            Nnet *nnet_to_update):
     options_(options), computation_(computation), nnet_(nnet),
-    program_counter_(0), nnet_to_update_(nnet_to_update) {
-  KALDI_ASSERT(computation.indexes_cuda.size() == computation.indexes.size() &&
-      computation.indexes_ranges_cuda.size() == computation.indexes_ranges.size() &&
+    program_counter_(0), nnet_to_store_stats_(nnet_to_update),
+    nnet_to_update_(nnet_to_update) {
+  Init();
+}
+
+NnetComputer::NnetComputer(const NnetComputeOptions &options,
+                           const NnetComputation &computation,
+                           Nnet *nnet,
+                           Nnet *nnet_to_update):
+    options_(options), computation_(computation), nnet_(*nnet),
+    program_counter_(0), nnet_to_store_stats_(nnet),
+    nnet_to_update_(nnet_to_update) {
+  Init();
+}
+
+void NnetComputer::Init() {
+  KALDI_ASSERT(computation_.indexes_cuda.size() == computation_.indexes.size() &&
+      computation_.indexes_ranges_cuda.size() == computation_.indexes_ranges.size() &&
       "You must call NnetComputation::ComputeCudaIndexes() before "
       "executing the computation.");
-  matrices_.resize(computation.matrices.size());
+  matrices_.resize(computation_.matrices.size());
   debug_ = (options_.debug || GetVerboseLevel() >= 5);
   if (debug_) {
     ComputationVariables variables;
-    variables.Init(computation);
-    ComputeCommandAttributes(nnet, computation, variables,
+    variables.Init(computation_);
+    ComputeCommandAttributes(nnet_, computation_, variables,
                              &command_attributes_);
     std::string preamble;
-    computation.GetCommandStrings(nnet, &preamble, &command_strings_);
+    computation_.GetCommandStrings(nnet_, &preamble, &command_strings_);
     KALDI_LOG << preamble;
-    computation.GetSubmatrixStrings(nnet, &submatrix_strings_);
+    computation_.GetSubmatrixStrings(nnet_, &submatrix_strings_);
   }
 }
 
@@ -177,6 +192,7 @@ NnetComputer::NnetComputer(const NnetComputer &other):
     nnet_(other.nnet_),
     program_counter_(other.program_counter_),
     pending_commands_(other.pending_commands_),
+    nnet_to_store_stats_(other.nnet_to_store_stats_),
     nnet_to_update_(other.nnet_to_update_),
     debug_(other.debug_),
     command_attributes_(other.command_attributes_),
@@ -226,14 +242,14 @@ void NnetComputer::ExecuteCommand() {
       CuSubMatrix<BaseFloat> output(GetSubMatrix(c.arg4));
       void *memo = component->Propagate(indexes, input, &output);
       if (c.arg6) {  // need to store stats.
-        KALDI_ASSERT(nnet_to_update_ != NULL);
-        Component *upd_component = nnet_to_update_->GetComponent(c.arg1);
+        KALDI_ASSERT(nnet_to_store_stats_ != NULL);
+        Component *stats_component = nnet_to_store_stats_->GetComponent(c.arg1);
         bool was_in_place = (c.arg3 == c.arg4);
         // if propagate was in-place, provide empty matrix and not 'input', as
         // input is no longer valid.
         const CuSubMatrix<BaseFloat> maybe_input(
            GetSubMatrix(was_in_place ? 0 : c.arg3));
-        upd_component->StoreStats(maybe_input, output, memo);
+        stats_component->StoreStats(maybe_input, output, memo);
       }
       SaveMemo(c.arg5, *component, memo);
       break;
     }
@@ -245,11 +261,21 @@
       debug_str << nnet_.GetComponentName(c.arg1);
       const Component *component = nnet_.GetComponent(c.arg1);
       KALDI_ASSERT(!(computation_.need_model_derivative && !nnet_to_update_));
-      Component *upd_component = (nnet_to_update_ &&
-                                  c.command_type == kBackprop &&
-                                  computation_.need_model_derivative ?
-                                  nnet_to_update_->GetComponent(c.arg1) :
-                                  NULL);
+      Component *upd_component = NULL;
+      if (c.command_type == kBackprop) {  // this block sets 'upd_component'
+        Nnet *nnet_to_update;
+        if (component->Properties() & kUpdatableComponent) {
+          nnet_to_update = (computation_.need_model_derivative ?
+                            nnet_to_update_ : NULL);
+        } else {
+          // Some non-updatable components, such as CompositeComponent, store
+          // stats in the backprop. For other types of non-updatable
+          // component, this arg won't matter.
+          nnet_to_update = nnet_to_store_stats_;
+        }
+        if (nnet_to_update)
+          upd_component = nnet_to_update->GetComponent(c.arg1);
+      }
       ComponentPrecomputedIndexes *indexes =
           computation_.component_precomputed_indexes[c.arg2].data;
       const CuSubMatrix<BaseFloat> in_value(GetSubMatrix(c.arg3));
@@ -356,6 +382,42 @@
      }
      break;
    }
+    case kCompressMatrix:
+      // This does nothing if CUDA is not in use.
+#if HAVE_CUDA == 1
+      if (CuDevice::Instantiate().Enabled()) {
+        if (compressed_matrices_.empty())
+          compressed_matrices_.resize(matrices_.size(), NULL);
+        int32 m = computation_.submatrices[c.arg1].matrix_index;
+        KALDI_ASSERT(compressed_matrices_[m] == NULL &&
+                     matrices_[m].NumRows() != 0);
+        BaseFloat range = c.alpha;
+        bool truncate = (c.arg3 != 0);
+        compressed_matrices_[m] = NewCuCompressedMatrix(
+            static_cast<CuCompressedMatrixType>(c.arg2),
+            range, truncate);
+        compressed_matrices_[m]->CopyFromMat(matrices_[m]);
+        matrices_[m].Resize(0, 0);
+      }
+      break;
+#endif
+    case kDecompressMatrix:
+#if HAVE_CUDA == 1
+      if (CuDevice::Instantiate().Enabled()) {
+        int32 m = computation_.submatrices[c.arg1].matrix_index;
+        CuCompressedMatrixBase *compressed_matrix =
+            compressed_matrices_[m];
+        KALDI_ASSERT(compressed_matrix != NULL &&
+                     matrices_[m].NumRows() == 0);
+        matrices_[m].Resize(compressed_matrix->NumRows(),
+                            compressed_matrix->NumCols(),
+                            kUndefined);
+        compressed_matrix->CopyToMat(&(matrices_[m]));
+        delete compressed_matrix;
+        compressed_matrices_[m] = NULL;
+      }
+#endif
+      break;
    case kNoOperation: case kNoOperationPermanent: case kNoOperationMarker:
    case kNoOperationLabel:
      break;
@@ -609,5 +671,14 @@
   }
 }
 
+NnetComputer::~NnetComputer() {
+  // Delete any pointers that are present in compressed_matrices_. Actually
+  // they should all already have been deallocated and set to NULL if the
+  // computation was run to completion; we do this in case someone ran
+  // the forward propagation but not the backprop.
+  for (size_t i = 0; i < compressed_matrices_.size(); i++)
+    delete compressed_matrices_[i];
+}
+
 } // namespace nnet3
 } // namespace kaldi
diff --git a/src/nnet3/nnet-compute.h b/src/nnet3/nnet-compute.h
index e16cbfbb393..333ed3168b9 100644
--- a/src/nnet3/nnet-compute.h
+++ b/src/nnet3/nnet-compute.h
@@ -62,15 +62,29 @@ class NnetComputer {
   /// model update or model-derivative computation.
   /// You must call computation.ComputeCudaIndexes() before calling
   /// this function.
+ /// + /// Caution: there is another constructor that takes a pointer for + /// 'nnet', be careful not to mix these up. NnetComputer(const NnetComputeOptions &options, const NnetComputation &computation, const Nnet &nnet, Nnet *nnet_to_update); - /// Copy constructor. May not be used if memos are involved (memos are only - /// possible if backprop will take place, and in these situations you won't - /// normally be wanting to use the copy constructor anyway; the copy - /// constructor is more useful for things like RNNLM lattice rescoring). + /// This version of the constructor accepts a pointer to 'nnet' instead + /// of a const reference. The difference is that this version will, + /// for storing statistics (the StoreStats() function of class Component), + /// use 'nnet' instead of 'nnet_to_update' (if specified). + NnetComputer(const NnetComputeOptions &options, + const NnetComputation &computation, + Nnet *nnet, + Nnet *nnet_to_update); + + + /// Copy constructor. May not be used if memos are stored with this object + /// (which is only a possibility if backprop will take place, and in these + /// situations you won't normally be wanting to use the copy constructor + /// anyway; the copy constructor is more useful for things like RNNLM lattice + /// rescoring). NnetComputer(const NnetComputer &other); /// e.g. AcceptInput ("input", &input_mat), or for derivatives w.r.t. the @@ -111,10 +125,14 @@ class NnetComputer { CuMatrix *output); + ~NnetComputer(); private: + void Init(); // called from constructors. + const NnetComputeOptions &options_; const NnetComputation &computation_; const Nnet &nnet_; + int32 program_counter_; // command index to execute next. // To deal with inputs and outputs that are not provided/taken by the user in // the same order as listed in the computation, pending_commands_ contains a @@ -122,6 +140,13 @@ class NnetComputer { // executed. std::vector pending_commands_; + // A pointer to the copy of the nnet which we'll be using for stats + // accumulation (the StoreStats() function). May be NULL or the same + // as nnet_ or nnet_to_update_. + Nnet *nnet_to_store_stats_; + // A pointer to the copy of the nnet which we'll be updating the parameters + // of (nnet_to_update in the backprop function). May be NULL and usually + // will not be the same as nnet_. Nnet *nnet_to_update_; bool debug_; // command_attributes_ is only used if debug_=true. @@ -139,6 +164,14 @@ class NnetComputer { // NULL). std::vector memos_; + // This is only used when commands kCompressMatrix and kDecompressMatrix are + // invoked. It will be (the first time we compress a matrix) resized to be + // the same size as 'matrices_' (i.e., indexed by matrix index). When we + // compress a matrix m we set compressed_matrices_[m] to a non-NULL value and + // resize matrices_[m] to empty; and when we uncompress it, the reverse + // happens. + std::vector compressed_matrices_; + // executes the command in computation_.commands[program_counter_]. void ExecuteCommand(); @@ -207,7 +240,6 @@ class NnetComputer { // memos are not reusable. inline void *GetMemo(int32 memo_index); - private: NnetComputer &operator = (const NnetComputer &other); // Disallow. 
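// To see how 'compressed_matrices_' interacts with 'matrices_' at runtime,
// here is the life of one matrix m across a forward+backward pass (a sketch
// of the command sequence, based on ExecuteCommand(), not literal output):
//   kPropagate        -> matrices_[m] is written
//   kCompressMatrix   -> compressed_matrices_[m] = NewCuCompressedMatrix(...);
//                        copy matrices_[m] in; matrices_[m].Resize(0, 0)
//   ... commands that do not need m ...
//   kDecompressMatrix -> matrices_[m].Resize(rows, cols, kUndefined);
//                        copy back; delete compressed_matrices_[m]
//   kBackprop         -> matrices_[m] is read again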
}; diff --git a/src/nnet3/nnet-convolutional-component.cc b/src/nnet3/nnet-convolutional-component.cc index f689984e876..bea3b9d31d5 100644 --- a/src/nnet3/nnet-convolutional-component.cc +++ b/src/nnet3/nnet-convolutional-component.cc @@ -263,18 +263,14 @@ void TimeHeightConvolutionComponent::InitFromConfig(ConfigLine *cfl) { cfl->GetValue("alpha-out", &alpha_out); cfl->GetValue("num-minibatches-history", &num_minibatches_history); - preconditioner_in_.SetAlpha(alpha_in); - preconditioner_out_.SetAlpha(alpha_out); int32 dim_in = linear_params_.NumCols() + 1, dim_out = linear_params_.NumRows(); - if (rank_in < 0) { + if (rank_in < 0) rank_in = std::min(80, (dim_in + 1) / 2); - preconditioner_in_.SetRank(rank_in); - } - if (rank_out < 0) { + preconditioner_in_.SetRank(rank_in); + if (rank_out < 0) rank_out = std::min(80, (dim_out + 1) / 2); - preconditioner_out_.SetRank(rank_out); - } + preconditioner_out_.SetRank(rank_out); preconditioner_in_.SetNumMinibatchesHistory(num_minibatches_history); preconditioner_out_.SetNumMinibatchesHistory(num_minibatches_history); @@ -360,29 +356,29 @@ void TimeHeightConvolutionComponent::UpdateNaturalGradient( const CuMatrixBase &in_value, const CuMatrixBase &out_deriv) { - CuVector bias_temp(bias_params_.Dim()); + CuVector bias_deriv(bias_params_.Dim()); - { // this block computes 'bias_temp', the derivative w.r.t. the bias. + { // this block computes 'bias_deriv', the derivative w.r.t. the bias. KALDI_ASSERT(out_deriv.Stride() == out_deriv.NumCols() && out_deriv.NumCols() == model_.height_out * model_.num_filters_out); CuSubMatrix out_deriv_reshaped( out_deriv.Data(), out_deriv.NumRows() * model_.height_out, model_.num_filters_out, model_.num_filters_out); - bias_temp.AddRowSumMat(1.0, out_deriv_reshaped); + bias_deriv.AddRowSumMat(1.0, out_deriv_reshaped); } - CuMatrix params_temp(linear_params_.NumRows(), + CuMatrix params_deriv(linear_params_.NumRows(), linear_params_.NumCols() + 1); - params_temp.CopyColFromVec(bias_temp, linear_params_.NumCols()); + params_deriv.CopyColFromVec(bias_deriv, linear_params_.NumCols()); - CuSubMatrix linear_params_temp( - params_temp, 0, linear_params_.NumRows(), + CuSubMatrix linear_params_deriv( + params_deriv, 0, linear_params_.NumRows(), 0, linear_params_.NumCols()); ConvolveBackwardParams(indexes.computation, in_value, out_deriv, - 1.0, &linear_params_temp); + 1.0, &linear_params_deriv); // the precondition-directions code outputs a scalar that // must be multiplied by its output (this saves one @@ -393,22 +389,19 @@ void TimeHeightConvolutionComponent::UpdateNaturalGradient( // scalars are different across iterations, the scalars // will be pretty similar on different iterations BaseFloat scale1, scale2; - preconditioner_in_.PreconditionDirections(¶ms_temp, NULL, - &scale1); - + preconditioner_in_.PreconditionDirections(¶ms_deriv, &scale1); - CuMatrix params_temp_transpose(params_temp, kTrans); - preconditioner_out_.PreconditionDirections(¶ms_temp_transpose, - NULL, &scale2); + CuMatrix params_deriv_transpose(params_deriv, kTrans); + preconditioner_out_.PreconditionDirections(¶ms_deriv_transpose, &scale2); linear_params_.AddMat( learning_rate_ * scale1 * scale2, - params_temp_transpose.RowRange(0, linear_params_.NumCols()), + params_deriv_transpose.RowRange(0, linear_params_.NumCols()), kTrans); bias_params_.AddVec(learning_rate_ * scale1 * scale2, - params_temp_transpose.Row(linear_params_.NumCols())); + params_deriv_transpose.Row(linear_params_.NumCols())); } diff --git a/src/nnet3/nnet-general-component.cc 
b/src/nnet3/nnet-general-component.cc index 6b90787ea95..dd6e950a7d1 100644 --- a/src/nnet3/nnet-general-component.cc +++ b/src/nnet3/nnet-general-component.cc @@ -1252,7 +1252,7 @@ void ConstantComponent::Backprop( CuMatrix out_deriv_copy(out_deriv); BaseFloat scale = 1.0; to_update->preconditioner_.PreconditionDirections(&out_deriv_copy, - NULL, &scale); + &scale); to_update->output_.AddRowSumMat(scale * to_update->learning_rate_, out_deriv_copy); } else { diff --git a/src/nnet3/nnet-normalize-component.cc b/src/nnet3/nnet-normalize-component.cc new file mode 100644 index 00000000000..d10c6fabd36 --- /dev/null +++ b/src/nnet3/nnet-normalize-component.cc @@ -0,0 +1,680 @@ +// nnet3/nnet-normalize-component.cc + +// Copyright 2015-2017 Johns Hopkins University (author: Daniel Povey) +// 2015 Guoguo Chen +// 2015 Daniel Galvez + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include "nnet3/nnet-normalize-component.h" +#include "nnet3/nnet-parse.h" +#include "cudamatrix/cu-math.h" + +namespace kaldi { +namespace nnet3 { + +const BaseFloat NormalizeComponent::kSquaredNormFloor = + pow(2.0, NormalizeComponent::kExpSquaredNormFloor); + +NormalizeComponent::NormalizeComponent(const NormalizeComponent &other): + input_dim_(other.input_dim_), block_dim_(other.block_dim_), + target_rms_(other.target_rms_), + add_log_stddev_(other.add_log_stddev_) { } + +void NormalizeComponent::InitFromConfig(ConfigLine *cfl) { + input_dim_ = 0; + add_log_stddev_ = false; + target_rms_ = 1.0; + bool ok = cfl->GetValue("dim", &input_dim_) || + cfl->GetValue("input-dim", &input_dim_); + block_dim_ = input_dim_; + cfl->GetValue("block-dim", &block_dim_); + cfl->GetValue("target-rms", &target_rms_); + cfl->GetValue("add-log-stddev", &add_log_stddev_); + if (!ok || cfl->HasUnusedValues() || input_dim_ <= 0 || target_rms_ <= 0.0 || + block_dim_ <= 0 || input_dim_ % block_dim_ != 0) + KALDI_ERR << "Invalid initializer for layer of type " + << Type() << ": \"" << cfl->WholeLine() << "\""; +} + +void NormalizeComponent::Read(std::istream &is, bool binary) { + std::string token; + ReadToken(is, binary, &token); + if (token == "") { + ReadToken(is, binary, &token); + } + KALDI_ASSERT(token == "" || token == ""); + ReadBasicType(is, binary, &input_dim_); // Read dimension. + ReadToken(is, binary, &token); + if (token == "") { + ReadBasicType(is, binary, &block_dim_); + ReadToken(is, binary, &token); + } else { + block_dim_ = input_dim_; + } + // read target_rms_ if it is available. + if (token == "") { + ReadBasicType(is, binary, &target_rms_); + ReadToken(is, binary, &token); + } + // Read add_log_stddev_ token, if it is available. + if (token == "") { + ReadBasicType(is, binary, &add_log_stddev_); + ReadToken(is, binary, &token); + } else { + add_log_stddev_ = false; + } + if (token == "") { + // back-compatibility code. 
+ CuVector<double> temp; + temp.Read(is, binary); + ExpectToken(is, binary, "<DerivAvg>"); + temp.Read(is, binary); + ExpectToken(is, binary, "<Count>"); + double count; + ReadBasicType(is, binary, &count); + ReadToken(is, binary, &token); + } + KALDI_ASSERT(token == "</NormalizeComponent>"); +} + +void NormalizeComponent::Write(std::ostream &os, bool binary) const { + WriteToken(os, binary, "<NormalizeComponent>"); + WriteToken(os, binary, "<InputDim>"); + WriteBasicType(os, binary, input_dim_); + if (block_dim_ != input_dim_) { + WriteToken(os, binary, "<BlockDim>"); + WriteBasicType(os, binary, block_dim_); + } + WriteToken(os, binary, "<TargetRms>"); + WriteBasicType(os, binary, target_rms_); + WriteToken(os, binary, "<AddLogStddev>"); + WriteBasicType(os, binary, add_log_stddev_); + WriteToken(os, binary, "</NormalizeComponent>"); +} + +std::string NormalizeComponent::Info() const { + std::ostringstream stream; + stream << Type() << ", input-dim=" << InputDim() + << ", output-dim=" << OutputDim() << ", target-rms=" << target_rms_ + << ", add-log-stddev=" << std::boolalpha << add_log_stddev_; + if (block_dim_ != input_dim_) + stream << ", block-dim=" << block_dim_; + return stream.str(); +} + +// The output y_i = scale * x_i, +// and we want the RMS value of the y_i to equal target_rms, +// so y^t y = D * target_rms^2 (if y is one row of the input). +// we need to have scale = 1.0 / sqrt(x^t x / (D * target_rms^2)). +// there is also flooring involved, to avoid division-by-zero +// problems. It's important for the backprop that the floor's +// square root is exactly representable as float. +// If add_log_stddev_ is true, log(max(epsi, sqrt(x^t x / D))) +// is an extra dimension of the output. +void* NormalizeComponent::Propagate(const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase<BaseFloat> &in, + CuMatrixBase<BaseFloat> *out) const { + KALDI_ASSERT(in.NumCols() == InputDim() && out->NumCols() == OutputDim() && + in.NumRows() == out->NumRows()); + if (block_dim_ != input_dim_) { + int32 num_blocks = input_dim_ / block_dim_, + new_num_rows = in.NumRows() * num_blocks, + output_block_dim = block_dim_ + (add_log_stddev_ ? 1 : 0); + KALDI_ASSERT(in.Stride() == in.NumCols() && out->Stride() == out->NumCols()); + CuSubMatrix<BaseFloat> in_reshaped(in.Data(), new_num_rows, + block_dim_, block_dim_), + out_reshaped(out->Data(), new_num_rows, + output_block_dim, output_block_dim); + cu::NormalizePerRow(in_reshaped, target_rms_, add_log_stddev_, + &out_reshaped); + } else { + cu::NormalizePerRow(in, target_rms_, add_log_stddev_, out); + } + return NULL; +} + +/* + A note on the derivative of NormalizeComponent... + let both row_in and row_out be vectors of dimension D. + Let p = row_in^T row_in / (D * target_rms^2), and let + f = 1.0 / sqrt(max(kSquaredNormFloor, p)), and we compute row_out as: + row_out = f row_in. + Suppose we have a quantity deriv_out which is the derivative + of the objective function w.r.t. row_out. We want to compute + deriv_in which is the derivative of the objective function w.r.t. + row_in. Let the objective function be F. One term is obvious: we have + deriv_in = f deriv_out + .... + next we have to take into account the derivative that gets back-propagated + through f. Obviously, dF/df = deriv_out^T row_in. + And df/dp = (p <= kSquaredNormFloor ? 0.0 : -0.5 p^{-1.5}) = (f == 1.0 / sqrt(kSquaredNormFloor) ? 0.0 : -0.5 f^3), + and dp/d(row_in) = 2/(D * target_rms^2) row_in. [it's vector_valued]. + So this term in dF/d(row_in) equals: + dF/df df/dp dp/d(row_in) = 2/(D * target_rms^2) (f == 1.0 / sqrt(kSquaredNormFloor) ? 0.0 : -0.5 f^3) (deriv_out^T row_in) row_in + So + deriv_in = f deriv_out + (f == 1.0 / sqrt(kSquaredNormFloor) ?
0.0 : -f^3 / (D * target_rms^2) ) (deriv_out^T row_in) row_in + + if add_log_stddev_ true, the deriv_in has another term as + dF/dx_i = dF/df . df/dx_i => df/dx_i = x_i/(x^T x) +*/ +void NormalizeComponent::Backprop(const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in_value, + const CuMatrixBase &, // out_value + const CuMatrixBase &out_deriv, + void *memo, + Component *to_update, + CuMatrixBase *in_deriv) const { + if (!in_deriv) + return; + if (block_dim_ != input_dim_) { + int32 num_blocks = input_dim_ / block_dim_, + new_num_rows = in_value.NumRows() * num_blocks, + output_block_dim = block_dim_ + (add_log_stddev_ ? 1 : 0); + KALDI_ASSERT(in_value.Stride() == in_value.NumCols() && + out_deriv.Stride() == out_deriv.NumCols() && + in_deriv->Stride() == in_deriv->NumCols()); + CuSubMatrix in_value_reshaped(in_value.Data(), new_num_rows, + block_dim_, block_dim_), + out_deriv_reshaped(out_deriv.Data(), new_num_rows, + output_block_dim, output_block_dim), + in_deriv_reshaped(in_deriv->Data(), new_num_rows, + block_dim_, block_dim_); + cu::DiffNormalizePerRow(in_value_reshaped, out_deriv_reshaped, target_rms_, + add_log_stddev_, &in_deriv_reshaped); + } else { + cu::DiffNormalizePerRow(in_value, out_deriv, target_rms_, add_log_stddev_, + in_deriv); + } +} + +void BatchNormComponent::ComputeDerived() { + if (!test_mode_) { + offset_.Resize(0); + scale_.Resize(0); + return; + } + + if (count_ == 0.0) { + KALDI_WARN << "Test-mode is set but there is no data count. " + "Creating random counts. This only makes sense " + "in unit-tests (or compute_prob_*.0.log). If you see this " + "elsewhere, something is very wrong."; + count_ = 1.0; + stats_sum_.SetRandn(); + stats_sumsq_.SetRandn(); + stats_sumsq_.AddVecVec(1.0, stats_sum_, stats_sum_, 1.0); + } + + offset_.Resize(block_dim_); + scale_.Resize(block_dim_); + offset_.CopyFromVec(stats_sum_); + offset_.Scale(-1.0 / count_); + // now offset_ is -mean. + scale_.CopyFromVec(stats_sumsq_); + scale_.Scale(1.0 / count_); + scale_.AddVecVec(-1.0, offset_, offset_, 1.0); + // now scale_ is variance. + // Mathematically the ApplyFloor statement should be a no-op; this is in case + // of numerical roundoff. + scale_.ApplyFloor(0.0); + scale_.Add(epsilon_); + BaseFloat power = -0.5; + scale_.ApplyPow(power); + // now scale_ = min(variance, epsilon)^power + // next, multiply by the target RMS (normally 1.0). + scale_.Scale(target_rms_); + offset_.MulElements(scale_); + // now offset_ is -(scale*mean). +} + +void BatchNormComponent::SetTestMode(bool test_mode) { + test_mode_ = test_mode; + ComputeDerived(); +} + +void BatchNormComponent::Check() const { + KALDI_ASSERT(dim_ > 0 && block_dim_ > 0 && dim_ % block_dim_ == 0 && + epsilon_ > 0.0 && target_rms_ > 0.0); +} + +BatchNormComponent::BatchNormComponent(const BatchNormComponent &other): + dim_(other.dim_), block_dim_(other.block_dim_), + epsilon_(other.epsilon_), target_rms_(other.target_rms_), + test_mode_(other.test_mode_), count_(other.count_), + stats_sum_(other.stats_sum_), stats_sumsq_(other.stats_sumsq_) { + ComputeDerived(); + Check(); +} + + +std::string BatchNormComponent::Info() const { + std::ostringstream stream; + stream << Type() << ", dim=" << dim_ << ", block-dim=" << block_dim_ + << ", epsilon=" << epsilon_ << ", target-rms=" << target_rms_ + << ", count=" << count_ + << ", test-mode=" << (test_mode_ ? 
"true" : "false"); + if (count_ > 0) { + Vector mean(stats_sum_), var(stats_sumsq_); + mean.Scale(1.0 / count_); + var.Scale(1.0 / count_); + // subtract mean^2 from var. + var.AddVecVec(-1.0, mean, mean, 1.0); + var.ApplyFloor(0.0); + var.ApplyPow(0.5); // make it the stddev. + stream << ", data-mean=" << SummarizeVector(mean) + << ", data-stddev=" << SummarizeVector(var); + } + return stream.str(); +} + +void BatchNormComponent::InitFromConfig(ConfigLine *cfl) { + dim_ = -1; + block_dim_ = -1; + epsilon_ = 1.0e-03; + target_rms_ = 1.0; + test_mode_ = false; + bool ok = cfl->GetValue("dim", &dim_); + cfl->GetValue("block-dim", &block_dim_); + cfl->GetValue("epsilon", &epsilon_); + cfl->GetValue("target-rms", &target_rms_); + cfl->GetValue("test-mode", &test_mode_); + if (!ok || dim_ <= 0) { + KALDI_ERR << "BatchNormComponent must have 'dim' specified, and > 0"; + } + if (block_dim_ == -1) + block_dim_ = dim_; + if (!(block_dim_ > 0 && dim_ % block_dim_ == 0 && + epsilon_ > 0 && target_rms_ > 0)) + KALDI_ERR << "Invalid configuration in BatchNormComponent."; + if (cfl->HasUnusedValues()) + KALDI_ERR << "Could not process these elements in initializer: " + << cfl->UnusedValues(); + count_ = 0; + stats_sum_.Resize(block_dim_); + stats_sumsq_.Resize(block_dim_); + if (test_mode_) { + ComputeDerived(); + } +} + + + +/* + BATCHNORM_MATH + + This comment describes the equations involved in batch normalization, and + derives the forward and back-propagation. + + This is all dimension-by-dimension, so we just imagine the inputs + are scalars x(i), for i=0 .. n-1. + + FORWARD PASS: + + Let 'power' be a constant, equal to -0.5 for regular batch-norm. + + To simplify the math we (conceptually, not physically) do the normalization in + two stages: first mean, then variance, so we have x(i) -> y(i) -> z(i). + + The name 'rscale' means 'raw scale', meaning the scale before including + target-rms. Later we'll define 'scale = target-rms * rscale', to make some + of the actual computations slightly more efficient. + + Define: mean = 1/I * sum_i x(i) + y(i) = x(i) - mean + + var = 1/I \sum_i y(i)^2 + rscale = sqrt(var + epsilon)^power <---- For regular batchnorm, power == -0.5. + z(i) = target-rms * rscale * y(i) + + + Most of the rest of this comment derives how to compute the derivatives. If + you just want the formulas, please skip to the string 'BACKWARD PASS' below. + + We'll use a notation where an apostrophe on something means (the derivative of + the objective function w.r.t. that thing), so y'(i) is df/dy(i), and so on. + We are given y'(i). Propagating the derivatives backward: + + rscale' = (sum_i y(i) z'(i)) * target-rms + = (sum_i z(i) z'(i)) / rscale + + [ note: d(rscale)/d(var) = power * (var + epsilon)^{power - 1} + = power * rscale^{(power-1)/power} ] + + var' = rscale' * power * rscale^{(power-1)/power} + = power * (\sum_i z'(i) z(i)) * rscale^{(power-1)/power - 1} + = power * (\sum_i z'(i) z(i)) * rscale^{-1/power} + + [note: the following formula is of the form "direct term" + "indirect term"] + y'(i) = z'(i) * target-rms * rscale + 2/I y(i) var' + + Now, the above is inconvenient because it contains y(i) which is an intermediate + quantity. 
We reformulate in terms of z(i), using y(i) = z(i) / (target-rms * rscale), so: + + defining + var_deriv_mod = 2/I * var' / (target-rms * rscale) + = 2/I * power/target-rms * (\sum_i z'(i) z(i)) * rscale^{-(1+power)/power} + we have: + y'(i) = z'(i) * target-rms * rscale + z(i) var_deriv_mod + + Now, + mean' = \sum_i y'(i) + = (target-rms * rscale * \sum_i z'(i)) + (var_deriv_mod \sum_i z(i)) + [... and the 2nd term above is zero when summed over i, because \sum_i z(i) is zero, ...] + = target-rms * rscale * \sum_i z(i) + and: + x'(i) = z'(i) * target-rms * rscale + z(i) var_deriv_mod - 1/I mean' + = z'(i) * target-rms * rscale + z(i) var_deriv_mod - 1/I * target-rms * rscale * \sum_i z'(i) + = target-rms * rscale * (z'(i) - 1/I * \sum_i z'(i)) + z(i) var_deriv_mod + + It will simplify the code if we define: + + scale = target-rms * rscale. This way, we can write as follows: + + BACKWARD PASS (recap): + + var_deriv_mod = 2 * power * target-rms^{1/power} * (1/I \sum_i z'(i) z(i)) * scale^{-(1+power)/power} + .. which for power = -0.5, simplifies to: + var_deriv_mod = -1.0 / (target-rms^2) * (1/I \sum_i z'(i) z(i)) * scale + + x'(i) = scale * (z'(i) - 1/I * \sum_i z'(i)) + z(i) var_deriv_mod + + */ +void* BatchNormComponent::Propagate(const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const { + KALDI_ASSERT(SameDim(in, *out) && + (in.NumCols() == dim_ || in.NumCols() == block_dim_)); + if (in.NumCols() != block_dim_) { + // if block_dim_ != dim_, we recurse; this helps keep the main code + // simple. + KALDI_ASSERT(in.Stride() == in.NumCols() && out->Stride() == out->NumCols()); + int32 ratio = dim_ / block_dim_, orig_rows = in.NumRows(), + orig_cols = in.NumCols(), new_rows = orig_rows * ratio, + new_cols = orig_cols / ratio; + CuSubMatrix in_reshaped(in.Data(), new_rows, new_cols, new_cols), + out_reshaped(out->Data(), new_rows, new_cols, new_cols); + return Propagate(indexes, in_reshaped, &out_reshaped); + } + + // From this point, we can assume that the num-cols of 'in' and 'out' + // equals block_dim_. + + if (!test_mode_) { + // search in the comment above for FORWARD PASS to see what is being + // implemented here. + // if this takes too much time due to multiple different CUDA calls, + // we'll consider making a single kernel for some of it. + Memo *memo = new Memo; + int32 num_frames = in.NumRows(), dim = block_dim_; + memo->num_frames = num_frames; + memo->mean_uvar_scale.Resize(5, dim); + CuSubVector mean(memo->mean_uvar_scale, 0), + uvar(memo->mean_uvar_scale, 1), + scale(memo->mean_uvar_scale, 2); + mean.AddRowSumMat(1.0 / num_frames, in, 0.0); + uvar.AddDiagMat2(1.0 / num_frames, in, kTrans, 0.0); + scale.CopyFromVec(uvar); + + // by applying this scale at this point, we save a multiply later on. + BaseFloat var_scale = 1.0 / (target_rms_ * target_rms_); + scale.AddVecVec(-var_scale, mean, mean, var_scale); + // at this point, 'scale' contains just the variance (times target-rms^{-2}). + scale.ApplyFloor(0.0); + scale.Add(var_scale * epsilon_); + // Now 'scale' contains the variance floored to zero and then with epsilon + // added [both times 1/target-rms^2]. + scale.ApplyPow(-0.5); + // now 'scale' is the actual scale we'll use. + + // the next command will do no work if out == in, for in-place propagation. 
+ out->CopyFromMat(in); + out->AddVecToRows(-1.0, mean, 1.0); + out->MulColsVec(scale); + return static_cast(memo); + } else { + if (offset_.Dim() != block_dim_) { + if (count_ == 0) + KALDI_ERR << "Test mode set in BatchNormComponent, but no stats."; + else // why was ComputeDerived() not called? + KALDI_ERR << "Code error in BatchNormComponent"; + } + out->CopyFromMat(in); + out->MulColsVec(scale_); + out->AddVecToRows(1.0, offset_, 1.0); + return NULL; + } +} + +void BatchNormComponent::Backprop( + const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in_value, // unused + const CuMatrixBase &out_value, + const CuMatrixBase &out_deriv, + void *memo_in, + Component *to_update, // unused + CuMatrixBase *in_deriv) const { + + KALDI_ASSERT(SameDim(out_value, out_deriv) && + SameDim(out_value, *in_deriv) && + (out_value.NumCols() == dim_ || + out_value.NumCols() == block_dim_)); + if (out_value.NumCols() != block_dim_) { + // if block_dim_ != dim_, we recurse; this helps keep the main code + // simple. + KALDI_ASSERT(out_value.Stride() == out_value.NumCols() && + out_deriv.Stride() == out_deriv.NumCols() && + in_deriv->Stride() == in_deriv->NumCols()); + int32 ratio = dim_ / block_dim_, + orig_rows = out_value.NumRows(), + orig_cols = out_value.NumCols(), + new_rows = orig_rows * ratio, new_cols = orig_cols / ratio; + CuSubMatrix out_value_reshaped(out_value.Data(), new_rows, + new_cols, new_cols), + out_deriv_reshaped(out_deriv.Data(), new_rows, new_cols, new_cols), + in_deriv_reshaped(in_deriv->Data(), new_rows, new_cols, new_cols); + // we'll never use in_value, so pass it in unchanged. + Backprop(debug_info, indexes, in_value, + out_value_reshaped, out_deriv_reshaped, + memo_in, to_update, &in_deriv_reshaped); + return; + } + + Memo *memo = static_cast(memo_in); + + if (!test_mode_) { + // search above for BACKWARD PASS for a comment describing the math. + KALDI_ASSERT(memo != NULL && "memo not passed into backprop"); + int32 num_frames = memo->num_frames; + KALDI_ASSERT(out_value.NumRows() == num_frames); + CuSubVector + scale(memo->mean_uvar_scale, 2), + var_deriv_mod(memo->mean_uvar_scale, 3), + temp(memo->mean_uvar_scale, 4); + + // var_deriv_mod is going to contain: + // 2 * power * target-rms^{1/power} * (1/I \sum_i z'(i) z(i)) * scale^{-(1+power)/power} + // which for power = -0.5 simplifies to: + // -1.0 / (target_rms * target_rms). + // but for now we don't have the power of 'scale', we'll add that later. + BaseFloat coeff = -1.0 / (target_rms_ * target_rms_ * num_frames); + + var_deriv_mod.AddDiagMatMat(coeff, out_value, kTrans, + out_deriv, kNoTrans, 0.0); + var_deriv_mod.MulElements(scale); + + temp.AddRowSumMat(-1.0 / num_frames, out_deriv, 0.0); + // the following statement does no work if in_deriv and out_deriv are the + // same matrix. + in_deriv->CopyFromMat(out_deriv); + in_deriv->AddVecToRows(1.0, temp); + // At this point, *in_deriv contains + // (z'(i) - 1/I * \sum_i z'(i)) + in_deriv->MulColsVec(scale); + // At this point, *in_deriv contains + // scale * (z'(i) - 1/I * \sum_i z'(i)) + + in_deriv->AddMatDiagVec(1.0, out_value, kNoTrans, + var_deriv_mod, 1.0); + + // At this point, *in_deriv contains what we described in the comment + // starting BATCHNORM_MATH as: + // x'(i) = scale * (z'(i) - 1/I * \sum_i z'(i)) + z(i) var_deriv_mod + } else { + KALDI_ASSERT(offset_.Dim() == block_dim_); + // the next call does no work if they point to the same memory. 
+ in_deriv->CopyFromMat(out_deriv); + in_deriv->MulColsVec(scale_); + } +} + +void BatchNormComponent::StoreStats( + const CuMatrixBase &in_value, + const CuMatrixBase &out_value, + void *memo_in) { + // in test mode this component does not store stats, it doesn't provide the + // kStoresStats flag. + KALDI_ASSERT(!test_mode_); + KALDI_ASSERT(out_value.NumCols() == dim_ || out_value.NumCols() == block_dim_); + if (out_value.NumCols() != block_dim_) { + // if block_dim_ != dim_, we recurse; this helps keep the main code + // simple. + KALDI_ASSERT(out_value.Stride() == out_value.NumCols()); + int32 ratio = dim_ / block_dim_, + orig_rows = out_value.NumRows(), + orig_cols = out_value.NumCols(), + new_rows = orig_rows * ratio, new_cols = orig_cols / ratio; + CuSubMatrix out_value_reshaped(out_value.Data(), new_rows, + new_cols, new_cols); + // we'll never use in_value, so just pass it in unchanged. + StoreStats(in_value, out_value_reshaped, memo_in); + return; + } + + Memo *memo = static_cast(memo_in); + KALDI_ASSERT(out_value.NumRows() == memo->num_frames); + + CuSubVector mean(memo->mean_uvar_scale, 0), + uvar(memo->mean_uvar_scale, 1); + KALDI_ASSERT(mean.Dim() == block_dim_ && memo->num_frames > 0); + BaseFloat num_frames = memo->num_frames; + if (stats_sum_.Dim() != block_dim_) { + stats_sum_.Resize(block_dim_); + stats_sumsq_.Resize(block_dim_); + KALDI_ASSERT(count_ == 0); + } + count_ += num_frames; + stats_sum_.AddVec(num_frames, mean, 1.0); + stats_sumsq_.AddVec(num_frames, uvar, 1.0); +} + +void BatchNormComponent::Read(std::istream &is, bool binary) { + ExpectOneOrTwoTokens(is, binary, "", ""); + ReadBasicType(is, binary, &dim_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &block_dim_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &epsilon_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &target_rms_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &test_mode_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &count_); + ExpectToken(is, binary, ""); + stats_sum_.Read(is, binary); + ExpectToken(is, binary, ""); + stats_sumsq_.Read(is, binary); + stats_sumsq_.AddVecVec(1.0, stats_sum_, stats_sum_, 1.0); + stats_sum_.Scale(count_); + stats_sumsq_.Scale(count_); + ExpectToken(is, binary, ""); + ComputeDerived(); + Check(); +} + +void BatchNormComponent::Write(std::ostream &os, bool binary) const { + Check(); + WriteToken(os, binary, ""); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, dim_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, block_dim_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, epsilon_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, target_rms_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, test_mode_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, count_); + CuVector mean(stats_sum_), var(stats_sumsq_); + if (count_ != 0) { + mean.Scale(1.0 / count_); + var.Scale(1.0 / count_); + var.AddVecVec(-1.0, mean, mean, 1.0); + } + WriteToken(os, binary, ""); + mean.Write(os, binary); + WriteToken(os, binary, ""); + var.Write(os, binary); + WriteToken(os, binary, ""); +} + +void BatchNormComponent::Scale(BaseFloat scale) { + if (scale == 0) { + count_ = 0.0; + stats_sum_.SetZero(); + stats_sumsq_.SetZero(); + } else { + count_ *= scale; + stats_sum_.Scale(scale); + stats_sumsq_.Scale(scale); + } +} + + +void BatchNormComponent::Add(BaseFloat alpha, const Component &other_in) { + const BatchNormComponent *other = + 
dynamic_cast<const BatchNormComponent*>(&other_in); + count_ += alpha * other->count_; + stats_sum_.AddVec(alpha, other->stats_sum_); + stats_sumsq_.AddVec(alpha, other->stats_sumsq_); + // this operation might change offset_ and scale_, so we recompute them + // in this instance (but not in Scale()). + ComputeDerived(); +} + +void BatchNormComponent::ZeroStats() { + // We only zero the stats if we're not in test mode. In test mode, this would + // be dangerous as the stats are the source for the transform, and zeroing + // them and then calling ComputeDerived() again would remove the transform + // parameters (offset_ and scale_). + if (!test_mode_) { + count_ = 0.0; + stats_sum_.SetZero(); + stats_sumsq_.SetZero(); + } +} + + +} // namespace nnet3 +} // namespace kaldi diff --git a/src/nnet3/nnet-normalize-component.h b/src/nnet3/nnet-normalize-component.h new file mode 100644 index 00000000000..1806fe38493 --- /dev/null +++ b/src/nnet3/nnet-normalize-component.h @@ -0,0 +1,303 @@ +// nnet3/nnet-normalize-component.h + +// Copyright 2011-2013 Karel Vesely +// 2012-2015 Johns Hopkins University (author: Daniel Povey) +// 2013 Xiaohui Zhang +// 2014-2015 Vijayaditya Peddinti +// 2014-2015 Guoguo Chen +// 2015 Daniel Galvez +// 2015 Tom Ko + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_NNET3_NNET_NORMALIZE_COMPONENT_H_ +#define KALDI_NNET3_NNET_NORMALIZE_COMPONENT_H_ + +#include "nnet3/nnet-common.h" +#include "nnet3/nnet-component-itf.h" +#include "nnet3/natural-gradient-online.h" +#include + +namespace kaldi { +namespace nnet3 { + +/// @file nnet-normalize-component.h +/// +/// This file contains declarations of components that in one way or +/// another normalize their input: NormalizeComponent and BatchNormComponent. + +/* + NormalizeComponent implements the function: + + y = x * (sqrt(dim(x)) * target-rms) / |x| + + where |x| is the 2-norm of the vector x. I.e. its output is its input + scaled such that the root-mean-square value of its elements equals + target-rms. (As a special case, if the input is zero, it outputs zero). + This is like Hinton's layer-norm, except not normalizing the mean, only + the variance. + + + Note: if you specify add-log-stddev=true, it adds an extra element to + y which equals log(|x| / sqrt(dim(x))). + + + Configuration values accepted: + dim, or input-dim Input dimension of this component, e.g. 1024. + Will be the same as the output dimension if add-log-stddev=false. + block-dim Defaults to 'dim', but you may specify a nonzero divisor + of 'dim'. In this case the input dimension will + be interpreted as blocks of dimension 'block-dim' + to which the nonlinearity described above is applied + separately. + add-log-stddev You can set this to true to add an extra output + dimension which will equal log(|x| / sqrt(dim(x))). + If block-dim is specified, this is done per block.
+ target-rms This defaults to 1.0, but if set it to another + (nonzero) value, the output will be scaled by this + factor. + */ +class NormalizeComponent: public Component { + public: + explicit NormalizeComponent(const NormalizeComponent &other); + + virtual int32 Properties() const { + return kSimpleComponent|kBackpropNeedsInput|kBackpropAdds| + (add_log_stddev_ ? 0 : kPropagateInPlace|kBackpropInPlace) | + (block_dim_ != input_dim_ ? kInputContiguous|kOutputContiguous : 0); + } + NormalizeComponent() { } + virtual std::string Type() const { return "NormalizeComponent"; } + virtual void InitFromConfig(ConfigLine *cfl); + virtual Component* Copy() const { return new NormalizeComponent(*this); } + virtual void* Propagate(const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const; + virtual void Backprop(const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in_value, + const CuMatrixBase &, // out_value + const CuMatrixBase &out_deriv, + void *memo, + Component *to_update, + CuMatrixBase *in_deriv) const; + + virtual void Read(std::istream &is, bool binary); + virtual void Write(std::ostream &os, bool binary) const; + virtual int32 InputDim() const { return input_dim_; } + virtual int32 OutputDim() const { + return (input_dim_ + (add_log_stddev_ ? (input_dim_ / block_dim_) : 0)); + } + virtual std::string Info() const; + private: + NormalizeComponent &operator = (const NormalizeComponent &other); // Disallow. + enum { kExpSquaredNormFloor = -66 }; + // kSquaredNormFloor is about 0.7e-20. We need a value that's exactly representable in + // float and whose inverse square root is also exactly representable + // in float (hence, an even power of two). + static const BaseFloat kSquaredNormFloor; + int32 input_dim_; + int32 block_dim_; + BaseFloat target_rms_; // The target rms for outputs, default 1.0. + + bool add_log_stddev_; // If true, log(max(epsi, sqrt(row_in^T row_in / D))) + // is an extra dimension of the output. +}; + + +/* + BatchNormComponent + + This implements batch normalization; for each dimension of the + input it normalizes the data to be zero-mean, unit-variance. You + can set the block-dim configuration value to implement spatial + batch normalization, see the comment for the variable. + + If you want to combine this with the trainable offset and scale that the + original BatchNorm paper used, then follow this by the + ScaleAndOffsetComponent. + + It's a simple component (uses the kSimpleComponent flag), but it is unusual in + that it will give different results if you call it on half the matrix at a + time. Most of the time this would be pretty harmless, so we still return the + kSimpleComponent flag. We may have to modify the test code a little to + account for this, or possibly remove the kSimpleComponent flag. In some sense + each output Index depends on every input Index, but putting those dependencies + explicitly into the dependency-tracking framework as a GeneralComponent + would be very impractical and might lead to a lot of unnecessary things being + computed. You have to be a bit careful where you put this component, and understand + what you're doing e.g. putting it in the path of a recurrence is a bit problematic + if the minibatch size is small. + + Accepted configuration values: + dim Dimension of the input and output + block-dim Defaults to 'dim', but may be set to a nonzero divisor + of 'dim'. 
In this case, each block of dimension 'block-dim' + is treated like a separate row of the input matrix, which + means that the stats from the n'th element of each + block are pooled into one class, for each n. + epsilon Small term added to the variance that is used to prevent + division by zero + target-rms This defaults to 1.0, but if set, for instance, to 2.0, + it will normalize the standard deviation of the output to + 2.0. 'target-stddev' might be a more suitable name, but this + was chosen for consistency with NormalizeComponent. + */ +class BatchNormComponent: public Component { + public: + + BatchNormComponent() { } + + // call this with 'true' to set 'test mode' where the batch normalization is + // done with stored stats. There won't normally be any need to specially + // accumulate these stats; they are stored as a matter of course on each + // iteration of training, as for NonlinearComponents, and we'll use the stats + // from the most recent [script-level] iteration. + // (Note: it will refuse to actually set test-mode to true if there + // are no stats stored.) + void SetTestMode(bool test_mode); + + // constructor using another component + BatchNormComponent(const BatchNormComponent &other); + + virtual int32 InputDim() const { return dim_; } + virtual int32 OutputDim() const { return dim_; } + + virtual std::string Info() const; + virtual void InitFromConfig(ConfigLine *cfl); + virtual std::string Type() const { return "BatchNormComponent"; } + virtual int32 Properties() const { + // If the block-dim is less than the dim, we need the input and output + // matrices to be contiguous (stride==num-cols), as we'll be reshaping + // internally. This is not much of a cost, because this will be used + // in convnets where we have to do this anyway. + return kSimpleComponent|kBackpropNeedsOutput|kPropagateInPlace| + kBackpropInPlace| + (block_dim_ < dim_ ? kInputContiguous|kOutputContiguous : 0)| + (test_mode_ ? 0 : kUsesMemo|kStoresStats); + } + virtual void* Propagate(const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase<BaseFloat> &in, + CuMatrixBase<BaseFloat> *out) const; + virtual void Backprop(const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase<BaseFloat> &in_value, + const CuMatrixBase<BaseFloat> &out_value, + const CuMatrixBase<BaseFloat> &out_deriv, + void *memo, + Component *, // to_update, + CuMatrixBase<BaseFloat> *in_deriv) const; + + virtual void Read(std::istream &is, bool binary); // This Read function + // requires that the Component has the correct type. + + /// Write component to stream + virtual void Write(std::ostream &os, bool binary) const; + virtual Component* Copy() const { return new BatchNormComponent(*this); } + + virtual void Scale(BaseFloat scale); + virtual void Add(BaseFloat alpha, const Component &other); + virtual void ZeroStats(); + + + virtual void DeleteMemo(void *memo) const { delete static_cast<Memo*>(memo); } + + virtual void StoreStats(const CuMatrixBase<BaseFloat> &in_value, + const CuMatrixBase<BaseFloat> &out_value, + void *memo); + + // Members specific to this component type. + // Note: the offset and scale will only be nonempty in 'test mode'. + const CuVector<BaseFloat> &Offset() const { return offset_; } + const CuVector<BaseFloat> &Scale() const { return scale_; } + + private: + + struct Memo { + // number of frames (after any reshaping). + int32 num_frames; + // 'mean_uvar_scale' is of dimension 5 by block_dim_: + // Row 0 = mean = the mean of the rows of the input + // Row 1 = uvar = the uncentered variance of the input (= sumsq / num_frames).
+ // Row 2 = scale = the scale of the renormalization. + // Rows 3 and 4 are used as temporaries in Backprop. + CuMatrix mean_uvar_scale; + }; + + void Check() const; + + // this function is used in a couple of places; it turns the raw stats into + // the offset/scale term of a normalizing transform. + static void ComputeOffsetAndScale(double count, + BaseFloat epsilon, + const Vector &stats_sum, + const Vector &stats_sumsq, + Vector *offset, + Vector *scale); + // computes derived parameters offset_ and scale_. + void ComputeDerived(); + + // Dimension of the input and output. + int32 dim_; + // This would normally be the same as dim_, but if it's less (and it must be > + // 0 and must divide dim_), then each separate block of the input of dimension + // 'block_dim_' is treated like a separate frame for the purposes of + // normalization. This can be used to implement spatial batch normalization + // for convolutional setups-- assuming the filter-dim has stride 1, which it + // always will in the new code in nnet-convolutional-component.h. + int32 block_dim_; + + // Used to avoid exact-zero variances, epsilon has the dimension of a + // covariance. + BaseFloat epsilon_; + + // This value will normally be 1.0, which is the default, but you can set it + // to other values as a way to control how fast the following layer learns + // (smaller -> slower). The same config exists in NormalizeComponent. + BaseFloat target_rms_; + + // This is true if we want the batch normalization to operate in 'test mode' + // meaning the data mean and stddev used for the normalization are fixed + // quantities based on previously accumulated stats. Note: the stats we use + // for this are based on the same 'StoreStats' mechanism as we use for + // components like SigmoidComponent and ReluComponent; we'll be using + // the stats from the most recent [script-level] iteration of training. + bool test_mode_; + + + // total count of stats stored by StoreStats(). + double count_; + // sum-of-data component of stats of input data. + CuVector stats_sum_; + // sum-of-squared component of stats of input data. + CuVector stats_sumsq_; + + // offset_ and scale_ are derived from stats_sum_ and stats_sumsq_; they + // dictate the transform that is done in 'test mode'. They are set only when + // reading the model from disk and when calling SetTestMode(true); they are + // resized to empty when the stats are updated, to ensure that out-of-date + // values are not kept around. + CuVector offset_; + CuVector scale_; +}; + + + +} // namespace nnet3 +} // namespace kaldi + + +#endif diff --git a/src/nnet3/nnet-optimize-test.cc b/src/nnet3/nnet-optimize-test.cc index bcb02184720..35614d62b34 100644 --- a/src/nnet3/nnet-optimize-test.cc +++ b/src/nnet3/nnet-optimize-test.cc @@ -143,7 +143,7 @@ static bool UnitTestNnetOptimizeWithOptions(int32 srand_seed, KALDI_LOG << "Output sum (optimized) is " << output_opt.Sum(); if (!ApproxEqual(output, output_opt)) { KALDI_WARN << "Non-optimized and optimized versions of the computation give " - << "different outputs."; + << "different outputs: " << output << " vs. 
" << output_opt; return false; } diff --git a/src/nnet3/nnet-optimize-utils.cc b/src/nnet3/nnet-optimize-utils.cc index 26aaced54df..c53fba815fb 100644 --- a/src/nnet3/nnet-optimize-utils.cc +++ b/src/nnet3/nnet-optimize-utils.cc @@ -21,7 +21,6 @@ #include "nnet3/nnet-optimize-utils.h" #include "nnet3/nnet-optimize.h" - namespace kaldi { namespace nnet3 { @@ -737,9 +736,7 @@ bool VariableMergingOptimizer::MergeVariables() { // potentially merge into a single variable. const NnetComputation::Command &c = computation_->commands[command_index]; int32 s1 = -1, s2 = -1; - // TODO: add kScale command and remove the check for 1.0 if (c.command_type == kMatrixCopy && - // c.alpha == 1.0 && config_.remove_assignments) { s2 = c.arg1; // s2 is the written-to matrix. s1 = c.arg2; @@ -997,7 +994,7 @@ std::pair VariableMergingOptimizer::MayBeMerged( if (!left && !right) // save some time. return std::pair(false,false); bool is_assignment = (computation_->commands[command_index].command_type == - kMatrixCopy && + kMatrixCopy && computation_->commands[command_index].alpha == 1.0); ComputationAnalysis analysis(*computation_, analyzer_); if (is_assignment) { @@ -1018,6 +1015,268 @@ std::pair VariableMergingOptimizer::MayBeMerged( } +// This class is used inside the function +// `void ExtendMatrices(NnetComputation *computation)`; +// see that function's declaration in nnet-optimize-utils.h for +// a summary of what this class does. +class MatrixExtender { + public: + typedef NnetComputation::SubMatrixInfo SubMatrixInfo; + typedef NnetComputation::MatrixInfo MatrixInfo; + + MatrixExtender(NnetComputation *computation); + + void ExtendMatrices(); + + private: + // This function returns true if a copy command from 'src_submatrix' + // to 'dest_submatrix' has the properties we need to be able to + // extend its rows to cover all of the source matrix. + bool CanBeExtended(int32 dest_submatrix_index, + int32 src_submatrix_index); + + // This actually extends the matrices... it's called only if CanBeExtended() + // with the same args returned true. It modifies 'dest_submatrix_index' + // and 'src_submatrix_index'. + void Extend(int32 *dest_submatrix_index, int32 *src_submatrix_index); + + // This function modifies the computation to fix certain problems + // that might have been introduced by Extend()... allocation, deallocation, + void FixComputation(); + + // This function modifies the computation to fix the debug info; if needed, + // it's called from FixComputation(). + void FixDebugInfo(); + + // don't extend a destination matrix if it wasn't already + // at least 'min_proportion' (80%) big enough to store the source. + BaseFloat min_proportion_; + + NnetComputation *computation_; + + // Indexed by matrix-index m, orig_num_rows_[m] is the value of + // computation_->matrices[m].num_rows when this class was initialized, + // i.e. before we changed anything. + std::vector orig_num_rows_; + + // Indexed by matrix-index m, this vector contains true if matrix + // m is involved in any AcceptInput() or ProvideOutput() operations. + std::vector is_input_or_output_; +}; + +// note: the initializer for min_proportion_ below needs to be kept in sync with +// the min_proportion variable in +// ComputationChecker::CheckComputationUndefined() in nnet-analyze.cc. +MatrixExtender::MatrixExtender(NnetComputation *computation): + min_proportion_(0.8), + computation_(computation) { + int32 num_matrices = computation_->matrices.size(); + + { // set up orig_num_rows_. 
+ orig_num_rows_.resize(num_matrices); + // matrix 0 is not a real matrix so skip that index. + for (int32 m = 1; m < num_matrices; m++) + orig_num_rows_[m] = computation_->matrices[m].num_rows; + } + { // set up is_input_or_output_. + is_input_or_output_.resize(num_matrices, false); + std::vector<NnetComputation::Command>::iterator + command_iter = computation_->commands.begin(), + command_end = computation_->commands.end(); + for (; command_iter != command_end; ++command_iter) { + const NnetComputation::Command &command = *command_iter; + // make sure there are no kSwapMatrix commands; they should not be present + // at this stage of optimization. + KALDI_ASSERT(command.command_type != kSwapMatrix); + if (command.command_type == kProvideOutput || + command.command_type == kAcceptInput) { + int32 s = command.arg1, + m = computation_->submatrices[s].matrix_index; + is_input_or_output_[m] = true; + } + } + } +} + + +bool MatrixExtender::CanBeExtended(int32 dest_submatrix_index, + int32 src_submatrix_index) { + const SubMatrixInfo + &src_submatrix = computation_->submatrices[src_submatrix_index], + &dest_submatrix = computation_->submatrices[dest_submatrix_index]; + if (src_submatrix.matrix_index == dest_submatrix.matrix_index) + return false; + + // we can't resize the destination matrix if it's involved in input or output. + if (is_input_or_output_[dest_submatrix.matrix_index]) + return false; + + const MatrixInfo + &src_matrix = computation_->matrices[src_submatrix.matrix_index]; + + int32 dest_matrix_orig_num_rows = orig_num_rows_[dest_submatrix.matrix_index], + src_matrix_orig_num_rows = orig_num_rows_[src_submatrix.matrix_index]; + + if (src_submatrix.num_rows < min_proportion_ * src_matrix_orig_num_rows) + return false; + + // The following checks that the source submatrix covers all of the + // source matrix except a few final rows, and the destination submatrix goes + // to the final row of its matrix. + return (src_submatrix.col_offset == 0 && + src_submatrix.num_cols == src_matrix.num_cols && + src_submatrix.row_offset == 0 && + src_submatrix.num_rows < src_matrix.num_rows && + dest_submatrix.row_offset + dest_submatrix.num_rows == + dest_matrix_orig_num_rows); +} + + +void MatrixExtender::Extend(int32 *dest_submatrix_index, + int32 *src_submatrix_index) { + // copy the SubMatrixInfo to avoid iterator invalidation. + SubMatrixInfo + src_submatrix = computation_->submatrices[*src_submatrix_index], + dest_submatrix = computation_->submatrices[*dest_submatrix_index]; + + MatrixInfo &src_matrix = computation_->matrices[src_submatrix.matrix_index], + &dest_matrix = computation_->matrices[dest_submatrix.matrix_index]; + + int32 new_dest_num_rows = dest_submatrix.row_offset + src_matrix.num_rows; + + // extend the destination matrix so it has enough rows to fit the entire + // source matrix. Note: doing this will break certain invariances in the + // computation, principally with allocation and deallocation commands, which + // we'll later fix up by calling FixComputation(). + if (new_dest_num_rows > dest_matrix.num_rows) { + dest_matrix.num_rows = new_dest_num_rows; + // make sure there's a submatrix index covering the whole of the dest matrix. + computation_->submatrices.push_back( + SubMatrixInfo(dest_submatrix.matrix_index, 0, new_dest_num_rows, + 0, dest_matrix.num_cols)); + } + + // The following 3 statements create a new submatrix that will be + // the destination submatrix; it's the same as the original destination + // submatrix, but with a few extra rows.
+ *dest_submatrix_index = computation_->submatrices.size(); + dest_submatrix.num_rows = src_matrix.num_rows; + computation_->submatrices.push_back( + SubMatrixInfo(dest_submatrix)); + + // The following 3 statements create a new submatrix that will be + // the source submatrix; it's the same as the original source + // submatrix, but with a few extra rows, and actually will cover + // the entire source matrix. + *src_submatrix_index = computation_->submatrices.size(); + computation_->submatrices.push_back( + SubMatrixInfo(src_submatrix.matrix_index, 0, src_matrix.num_rows, + 0, src_matrix.num_cols)); +} + +void MatrixExtender::ExtendMatrices() { + std::vector<NnetComputation::Command>::iterator + command_iter = computation_->commands.begin(), + command_end = computation_->commands.end(); + bool changed = false; + for (; command_iter != command_end; ++command_iter) { + NnetComputation::Command &command = *command_iter; + if (command.command_type == kMatrixCopy && + command.alpha == 1.0) { + int32 dest_submatrix_index = command.arg1, + src_submatrix_index = command.arg2; + if (CanBeExtended(dest_submatrix_index, src_submatrix_index)) { + Extend(&command.arg1, &command.arg2); + changed = true; + } + } + } + if (changed) + FixComputation(); +} + +void MatrixExtender::FixComputation() { + // make sure that allocation and deallocation commands + // operate on whole matrix. + std::vector<NnetComputation::Command>::iterator + command_iter = computation_->commands.begin(), + command_end = computation_->commands.end(); + std::vector<int32> whole_submatrices; + computation_->GetWholeSubmatrices(&whole_submatrices); + for (; command_iter != command_end; ++command_iter) { + NnetComputation::Command &command = *command_iter; + if (command.command_type == kAllocMatrix || + command.command_type == kDeallocMatrix) { + int32 s = command.arg1, + m = computation_->submatrices[s].matrix_index, + new_s = whole_submatrices[m]; + if (new_s != s) { + KALDI_ASSERT( + computation_->submatrices[s] == computation_->submatrices[new_s] || + orig_num_rows_[m] != computation_->matrices[m].num_rows); + command.arg1 = new_s; + } + } + if (command.command_type == kSetConst && command.alpha == 0.0) { + int32 s = command.arg1, + m = computation_->submatrices[s].matrix_index, + new_s = whole_submatrices[m]; + if (new_s != s) { + { + const NnetComputation::SubMatrixInfo &info = computation_->submatrices[ + command.arg1]; + const NnetComputation::MatrixInfo &mat_info = computation_->matrices[ + info.matrix_index]; + // If this command wasn't zeroing the entirety of a matrix + // (before we extended the matrix), we don't need to extend it. + if (!(info.row_offset == 0 && info.col_offset == 0 && + info.num_cols == mat_info.num_cols && + info.num_rows == orig_num_rows_[info.matrix_index])) + continue; + // I know doing this via 'continue' is odd, but it's done this way to + // avoid invalid iterators still being in scope; I think some runtimes + // check for it. + } + command.arg1 = new_s; + } + } + } + if (!computation_->matrix_debug_info.empty()) + FixDebugInfo(); + RenumberComputation(computation_); +} + +void MatrixExtender::FixDebugInfo() { + int32 num_matrices = computation_->matrices.size(); + // matrix zero is not a 'real' matrix.
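// As a worked example of the padding performed below: if a matrix grew from
// 3 rows to 5, the loop keeps the 3 original cindexes and fills rows 3 and 4
// with copies of rows 1 and 2, setting t = kNoTime (the n values shown are
// invented for illustration):
//   before: (n=0,t=0) (n=0,t=1) (n=0,t=2)
//   after:  (n=0,t=0) (n=0,t=1) (n=0,t=2) (n=0,t=kNoTime) (n=0,t=kNoTime)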
+ for (int32 m = 1; m < num_matrices; m++) { + NnetComputation::MatrixDebugInfo &debug_info = + computation_->matrix_debug_info[m]; + int32 new_num_rows = computation_->matrices[m].num_rows, + old_num_rows = debug_info.cindexes.size(); + if (new_num_rows != old_num_rows) { + debug_info.cindexes.resize(new_num_rows); + int32 num_extra_rows = new_num_rows - old_num_rows; + // the following should be true because min_proportion_ > 0.5. + KALDI_ASSERT(num_extra_rows <= old_num_rows); + for (int32 r = old_num_rows; r < new_num_rows; r++) { + Cindex cindex = debug_info.cindexes[r - num_extra_rows]; + // set the 't' value to kNoTime which indicates that it's not a 'real' + // time step, and may avoid errors in checking code. + cindex.second.t = kNoTime; + debug_info.cindexes[r] = cindex; + } + } + } +} + +void ExtendMatrices(NnetComputation *computation) { + MatrixExtender ext(computation); + ext.ExtendMatrices(); +} + + + /** This class is responsible for consolidating the model-update part of backprop commands, for components in (e.g.) recurrent networks that need to have many separate backprop commands, into more efficient single commands @@ -2555,7 +2814,8 @@ static void ConvertNumNValues(int32 n_stride, int32 old_N, int32 new_N, // This class implements the internals of the ExpandComputation() function (used // in shortcut compilation); see comment by the declaration of -// ExpandComputation() in nnet-optimize-utils.h for overview. +// ExpandComputation() in nnet-optimize-utils.h for overview. (It relates to +// shortcut compilation). class ComputationExpander { public: ComputationExpander(const Nnet &nnet, @@ -2952,6 +3212,7 @@ void ComputationExpander::ComputeCommands() { case kAddRowRanges: ExpandRowRangesCommand(c, &c_out); break; + case kCompressMatrix: case kDecompressMatrix: case kAcceptInput: case kProvideOutput: case kNoOperation: case kNoOperationPermanent: case kNoOperationMarker: case kNoOperationLabel: case kGotoLabel: @@ -3466,13 +3727,12 @@ class ComputationLoopedOptimizer { /// expected to be command indexes of the kNoOperationMarker at segment /// boundaries, this function outputs for each of these command indexes a list /// of matrices which are 'active' at that point in time. By 'active' we mean - /// that the matrix has been written to before that time (note, we don't count - /// initialization with zeros as being written to); and will be read after - /// that time. These is the list of matrices that 'need to be in scope' - /// at those points in time. '*active_matrices' is indexed by the - /// same index as 'splice_point_commands', and is then a list of active - /// matrices, in numerical order of matrix index. - /// Note: for each i, (*active_matrices)[i] will be sorted and unique. + /// that the matrix has been written to before that time (including zeroing), + /// and will be read after that time. This is the list of matrices that + /// 'need to be in scope' at those points in time. '*active_matrices' is + /// indexed by the same index as 'splice_point_commands', and is then a list + /// of active matrices, in numerical order of matrix index. Note: for each i, + /// (*active_matrices)[i] will be sorted and unique. static void FindActiveMatrices(const NnetComputation &computation, const Analyzer &analyzer, const std::vector<int32> &splice_point_commands, @@ -4045,5 +4305,328 @@ void RemoveCommandsForUnusedMatrix(const Analyzer &analyzer, } } + + +// This comparison operator is used in the function InsertCommands() +// to sort a list of these pairs by the .first element.
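The sorted-insert merge that InsertCommands() performs can be sketched generically as follows; MergeInserts and its argument names are illustrative only, not Kaldi API:

#include <algorithm>
#include <utility>
#include <vector>

// Merge (position, value) insertions into 'items'; each value is placed
// just before the item whose index equals its position. Equal positions
// keep their relative order because the sort is stable.
template <typename T>
void MergeInserts(std::vector<std::pair<int, T> > *inserts,
                  std::vector<T> *items) {
  std::stable_sort(inserts->begin(), inserts->end(),
                   [](const std::pair<int, T> &a, const std::pair<int, T> &b) {
                     return a.first < b.first;
                   });
  std::vector<T> merged;
  merged.reserve(items->size() + inserts->size());
  size_t j = 0;
  for (size_t i = 0; i <= items->size(); i++) {
    while (j < inserts->size() && (*inserts)[j].first <= static_cast<int>(i))
      merged.push_back((*inserts)[j++].second);
    if (i < items->size()) merged.push_back((*items)[i]);
  }
  items->swap(merged);
}

The struct below supplies the '<' on the .first element that the real function passes to std::stable_sort.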
+struct CommandPairComparator { + // operator () should be viewed as a '<' operator that only looks at + // the .first element, treating the .second elements as equal. + bool operator () (const std::pair<int32, NnetComputation::Command> &p1, + const std::pair<int32, NnetComputation::Command> &p2) const { + return p1.first < p2.first; + } +}; + +void InsertCommands( + std::vector<std::pair<int32, NnetComputation::Command> > *new_commands, + NnetComputation *computation) { + int32 num_new_commands = new_commands->size(), + num_old_commands = computation->commands.size(); + if (num_new_commands == 0) + return; + CommandPairComparator comparison_operator; + // use std::stable_sort so that entries in 'new_commands' that + // have the same .first value stay in the same order they were + // in before sorting. + std::stable_sort(new_commands->begin(), new_commands->end(), + comparison_operator); + + if (RandInt(0, 3) == 0) { // check 'new_commands' + for (int32 i = 0; i + 1 < num_new_commands; i++) { + KALDI_ASSERT((*new_commands)[i].first <= (*new_commands)[i+1].first && + (*new_commands)[i].first >= 0 && + (*new_commands)[i+1].first <= num_old_commands); + } + } + std::vector<NnetComputation::Command> merged_commands; + merged_commands.reserve(num_old_commands + num_new_commands); + + std::vector<std::pair<int32, NnetComputation::Command> >::const_iterator + new_commands_iter = new_commands->begin(), + new_commands_end = new_commands->end(); + + for (int32 old_command_index = 0; old_command_index <= num_old_commands; + old_command_index++) { + while (new_commands_iter != new_commands_end && + new_commands_iter->first <= old_command_index) { + merged_commands.push_back(new_commands_iter->second); + ++new_commands_iter; + } + if (old_command_index < num_old_commands) + merged_commands.push_back(computation->commands[old_command_index]); + } + KALDI_ASSERT(merged_commands.size() == num_old_commands + + num_new_commands); + // copy to 'computation->commands' via shallow swap. + computation->commands.swap(merged_commands); + FixGotoLabel(computation); +} + +/** + This class is used in the function OptimizeMemoryCompression(), + once we determine that there is some potential to do memory compression + for this computation. + */ +class MemoryCompressionOptimizer { + public: + + /** @param [in] nnet The neural net the computation is for. + @param [in] memory_compression_level The level of compression: + 0 = no compression (the constructor should not be called with this value). + 1 = compression that doesn't affect the results (but still takes time). + 2 = compression that affects the results only very slightly + 3 = compression that affects the results a little more. + @param [in] middle_command Must be the command-index of the + command of type kNoOperationMarker in 'computation'. + @param [in,out] computation The computation we're optimizing. + */ + MemoryCompressionOptimizer(const Nnet &nnet, + int32 memory_compression_level, + int32 middle_command, + NnetComputation *computation): + nnet_(nnet), memory_compression_level_(memory_compression_level), + middle_command_(middle_command), computation_(computation) { } + + void Optimize(); + private: + + // This function, called from Optimize(), figures out whether we can compress + // matrix m, and if so, adds an entry to compress_info_. + void ProcessMatrix(int32 m); + + // This function modifies the commands in '*computation_', taking + // as input the commands in compress_info_. + void ModifyComputation(); + + // While deciding what matrices to compress we will create a list of structs + // of type MatrixCompressInfo.
Later we copy-and-modify the commands in the + // computation, putting the compression commands into their appropriate places. + struct MatrixCompressInfo { + // m is the matrix-index of the matrix we're going to compress. + int32 m; + // compression_command_index is the command-index of the command + // *after* which we will place the compression command. Normally + // this will be some type of propagation. + int32 compression_command_index; + // uncompression_command_index is the command-index of the command + // *before* which we will place the uncompression command. Normally + // this will be some type of backprop. + int32 uncompression_command_index; + // 'compression_type' (e.g. kCompressedMatrixInt8) determines the type + // we compress the BaseFloats to. + CuCompressedMatrixType compression_type; + // 'range' determines the range of values that the compressed values can + // be in: for signed types they are in [-range, range]; for unsigned + // types, in [0, range]. + // As a special case, range = 0 means that the compression just stores the + // sign (-1, 0 or 1) of the input, and decompresses it to -1, 0 or 1; this + // is useful for ReLUs. + BaseFloat range; + // this is provided to the initializer of CuCompressedMatrix; it should + // be true if the values being compressed are potentially outside of + // the representable range. + bool truncate; + MatrixCompressInfo(int32 m, int32 forward_command_index, + int32 backward_command_index, + CuCompressedMatrixType compression_type, + BaseFloat range, bool truncate): + m(m), compression_command_index(forward_command_index), + uncompression_command_index(backward_command_index), + compression_type(compression_type), range(range), + truncate(truncate) { } + + }; + std::vector<MatrixCompressInfo> compress_info_; + + const Nnet &nnet_; + int32 memory_compression_level_; + int32 middle_command_; + NnetComputation *computation_; + Analyzer analyzer_; +}; + + +void MemoryCompressionOptimizer::ModifyComputation() { + // whole_submatrices[m] is the submatrix-index of the submatrix that + // represents the whole of matrix m. + std::vector<int32> whole_submatrices; + computation_->GetWholeSubmatrices(&whole_submatrices); + + // 'pairs_to_insert' will be a list of pairs (command-index, command), + // meaning: (command-index just before which to insert this command; command + // to insert). + std::vector<std::pair<int32, NnetComputation::Command> > + pairs_to_insert; + pairs_to_insert.reserve(compress_info_.size() * 2); + for (size_t i = 0; i < compress_info_.size(); i++) { + const MatrixCompressInfo &info = compress_info_[i]; + int32 s = whole_submatrices[info.m]; + // below we use compression_command_index + 1 because we want the + // compression to go after the command in 'info.compression_command_index' + // (which might be, for instance, a forward propagation command). + std::pair<int32, NnetComputation::Command> p1( + info.compression_command_index + 1, + NnetComputation::Command(info.range, kCompressMatrix, + s, static_cast<int32>(info.compression_type), + info.truncate ? 1 : 0)); + pairs_to_insert.push_back(p1); + std::pair<int32, NnetComputation::Command> p2( + info.uncompression_command_index, + NnetComputation::Command(1.0, kDecompressMatrix, s)); + pairs_to_insert.push_back(p2); + } + InsertCommands(&pairs_to_insert, + computation_); +} + + +void MemoryCompressionOptimizer::Optimize() { + analyzer_.Init(nnet_, *computation_); + // note: matrix zero is not really a matrix.
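+  // (Schematic of the rewrite that ProcessMatrix() plus ModifyComputation()
+  // produce, under the assumption that matrix m's last forward-pass access
+  // is a propagate and its first backward-pass access is a backprop:
+  //
+  //   before:  ..., propagate(...m...), ..., kNoOperationMarker,
+  //            ..., backprop(...m...), ...
+  //   after:   ..., propagate(...m...), kCompressMatrix(m), ...,
+  //            kNoOperationMarker, ..., kDecompressMatrix(m),
+  //            backprop(...m...), ...
+  //
+  // so m is held in compressed form across the forward/backward boundary,
+  // which is typically where memory use peaks.)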
+ int32 num_matrices = computation_->matrices.size(); + for (int32 m = 1; m < num_matrices; m++) + ProcessMatrix(m); + if (!compress_info_.empty()) + ModifyComputation(); +} + +void MemoryCompressionOptimizer::ProcessMatrix(int32 m) { + if (analyzer_.matrix_accesses[m].is_output) { + return; // We can't do this optimization for matrices that are going to be + // output to the user. + } + + // 'accesses' lists the commands that access this matrix. + const std::vector<Access> &accesses = analyzer_.matrix_accesses[m].accesses; + // the 'kReadAccess' below is actually a don't-care. This is just + // to find the position in 'accesses' that corresponds to command-index + // 'middle_command'. + Access middle_access(middle_command_, kReadAccess); + std::vector<Access>::const_iterator iter = std::lower_bound(accesses.begin(), + accesses.end(), + middle_access); + // At this point, 'iter' points to the first access in 'accesses' + // whose command index is >= 'middle_command_' (which separates the forward + // and backward passes), or accesses.end() if this matrix was not + // accessed during the backward pass. + if (iter == accesses.end()) { + return; // There is nothing to do: this matrix was not accessed during the + // backward pass. + } + if (iter == accesses.begin()) { + return; // There is nothing to do: this matrix was not accessed during the + // forward pass. + } + // 'backward_access' is the first access of the matrix in the backward + // pass of the computation, and + // 'forward_access' is the last access of the matrix in the forward pass + // of the computation. + const Access &backward_access = iter[0], + &forward_access = iter[-1]; + KALDI_ASSERT(forward_access.command_index < middle_command_ && + backward_access.command_index > middle_command_); + + // 'backward_access_is_last_access' is going to be set to true if + // 'backward_access' is the last command to access the matrix (apart from + // deallocation or matrix-swap commands, which don't show up in the list of + // accesses). + bool backward_access_is_last_access = (accesses.end() == iter + 1); + + int32 backward_command_index = backward_access.command_index, + forward_command_index = forward_access.command_index; + NnetComputation::Command + &backward_command = computation_->commands[backward_command_index]; + + if (memory_compression_level_ >= 1 && + backward_access_is_last_access && + backward_access.access_type == kReadAccess && + backward_command.command_type == kBackprop) { + int32 component_index = backward_command.arg1; + const Component *component = nnet_.GetComponent(component_index); + // this is potentially a candidate for our optimization for ReLU units, + // where we only need to store the sign. + if (component->Type() == "RectifiedLinearComponent") { + compress_info_.push_back( + MatrixCompressInfo(m, forward_command_index, + backward_command_index, + kCompressedMatrixUint8, 0.0, + true)); + return; + } + } + + // If memory_compression_level >= 2 (an "intermediate" level of compression), + // then we'll consider compressing quantities using 16 bits in the range + // [-10, 10]. Because of the way this compression works, exact zero will + // still be uncompressed as exact zero, so even if this is the output + // of a ReLU, it's OK. (Having a few derivatives zero for ReLU outputs + // that were very close to zero is OK.)
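+  // (Rough numeric sketch of the 16-bit case -- the exact rounding is up to
+  // CuCompressedMatrix, so take this as illustrative only: a symmetric linear
+  // quantizer with range = 10.0 has step 10.0 / 32767 ~= 3.05e-04, so values
+  // in [-10, 10] come back with error of at most ~1.5e-04, e.g.
+  //
+  //   float x = 0.123456f;
+  //   short q = (short)roundf(x * 32767.0f / 10.0f);  // q == 405
+  //   float x_hat = q * 10.0f / 32767.0f;             // ~= 0.12360
+  //
+  // and x == 0.0f maps to q == 0, which decompresses to exactly 0.0f.)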
+ if (memory_compression_level_ >= 2) { + compress_info_.push_back( + MatrixCompressInfo(m, forward_command_index, + backward_command_index, + kCompressedMatrixInt16, 10.0, + true)); + return; + } + + // TODO: later maybe implement something for memory compression level = 3. +} + + + + +void OptimizeMemoryCompression(const Nnet &nnet, + int32 memory_compression_level, + NnetComputation *computation) { + if (memory_compression_level == 0 || computation->commands.empty()) + return; + // don't apply this optimization to looped computations. + if (computation->commands.back().command_type == kGotoLabel) + return; + + // 'middle_command' will be the index of the command of type + // 'kNoOperationMarker' that separates the forward and backward + // passes. If it doesn't exist, it means this computation doesn't + // include a backprop pass. + int32 middle_command = -1; + for (size_t i = 0; i < computation->commands.size(); i++) { + if (computation->commands[i].command_type == kNoOperationMarker) { + if (middle_command < 0) { + middle_command = static_cast<int32>(i); + } else { + KALDI_WARN << "Found more than one command of type kNoOperationMarker " + "in non-looped computation."; + // there is more than one command of this type... this wasn't expected. + // return (i.e. do nothing). + return; + } + } + } + if (middle_command == -1) { + return; // This computation doesn't have a backprop pass. + } + if (memory_compression_level >= 1) { + int64 bytes_used_initial, bytes_used_final; + if (GetVerboseLevel() >= 2) + bytes_used_initial = GetMaxMemoryUse(*computation); + + MemoryCompressionOptimizer opt(nnet, memory_compression_level, + middle_command, computation); + opt.Optimize(); + + if (GetVerboseLevel() >= 2) { + bytes_used_final = GetMaxMemoryUse(*computation); + if (bytes_used_final != bytes_used_initial) { + KALDI_VLOG(2) << "Memory compression reduced memory use from " + << bytes_used_initial << " to " + << bytes_used_final << " bytes."; + } + } + } +} + + } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-optimize-utils.h b/src/nnet3/nnet-optimize-utils.h index 98615e2e146..703f43af095 100644 --- a/src/nnet3/nnet-optimize-utils.h +++ b/src/nnet3/nnet-optimize-utils.h @@ -181,6 +181,16 @@ class VariableMergingOptimizer { bool already_called_merge_variables_; }; +/** + This is not really an optimization in itself but it can make things easier + for class VariableMergingOptimizer (usually called by its wrapper + VariableMergingOptimization()). It looks for a case where most of a matrix + (but not its final rows) is copied to some submatrix of another matrix, + where the row-range of that submatrix extends to the last row of the other + matrix; and it extends the other matrix with additional rows so that the + entire source matrix can be copied to the destination. + */ +void ExtendMatrices(NnetComputation *computation); /** @@ -524,6 +534,46 @@ void IdentifyIndexesArgs(std::vector<NnetComputation::Command> *commands, void IdentifyIndexesRangesArgs(std::vector<NnetComputation::Command> *commands, std::vector<int32*> *indexes_ranges_args); +/// Inserts commands into the computation at the requested places. 'commands' +/// is a list of pairs (command-index, command). For each entry (c, command) in +/// 'commands', 'command' is inserted into 'computation' just *before* the +/// command that (at entry) is in computation->commands[c].
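+/// (Small worked example: if the computation's commands are [c0, c1, c2] and
+/// 'commands' is [(1,a), (1,b), (3,x)], the result is [c0, a, b, c1, c2, x]:
+/// 'a' and 'b' both go just before the old c1, in their original order, and
+/// index 3 == commands.size() places 'x' at the end.)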
If there are multiple pairs with the same index + /// c, they will remain in the same order in which they were present in + /// 'commands'; however, 'commands' does not have to be sorted on 'c'. + /// As a special case, if c == computation->commands.size(), the + /// corresponding commands are inserted at the end of the computation. + /// This function will appropriately renumber the argument of the kGotoLabel + /// command of any 'looped' computation. Command indexes c in commands[*].first + /// must be in the range [0, computation->commands.size()]. + /// This function may modify 'commands' by sorting it. +void InsertCommands( + std::vector<std::pair<int32, NnetComputation::Command> > *commands, + NnetComputation *computation); + +/// Performs optimization to reduce memory usage where possible, +/// making use of the kCompressMatrix and kDecompressMatrix commands. +/// Should only be done after most other optimizations, because some +/// optimizations (such as variable-merging) would not work correctly +/// after doing this optimization. This does nothing for looped +/// computations. It's OK, though, to expand a shortcut computation +/// (i.e. call ExpandComputation) after doing this. +/// +/// memory_compression_level determines how aggressive the compression +/// is. Allowed values: +/// 0 = no compression at all +/// 1 = compression that doesn't affect results (e.g. compress +/// ReLU outputs to 1 byte, as just the sign is needed). +/// 2 = compression that may affect the results slightly (16-bit +/// compression, with a limited range, of quantities needed for +/// backprop, e.g. the outputs of NormalizeComponent and the like). +/// 3 = compression that may affect the results more than just +/// slightly. Not implemented yet, so currently equivalent to 2. +void OptimizeMemoryCompression(const Nnet &nnet, + int32 memory_compression_level, + NnetComputation *computation); + + /// This function tries to optimize computation 'computation' for an 'looped' /// computation. It expects as input a computation with no backprop but with /// multiple 'segments' separated by command kNoOperationLabel, where each diff --git a/src/nnet3/nnet-optimize.cc b/src/nnet3/nnet-optimize.cc index 7824ee88b5a..d614afce7d0 100644 --- a/src/nnet3/nnet-optimize.cc +++ b/src/nnet3/nnet-optimize.cc @@ -439,7 +439,7 @@ void ConvertAdditionToAssignment(const Nnet &nnet, case kMatrixAdd: c.command_type = kMatrixCopy; break; case kAddRows: c.command_type = kCopyRows; - break; + break; case kAddRowsMulti: c.command_type = kCopyRowsMulti; break; // note: kCopyToRowsMulti does not currently support alpha != 1.0.
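// (Background on this switch, as a rough sketch rather than the real
// analysis code: ConvertAdditionToAssignment rewrites an addition as a copy
// when the analysis shows the destination holds no prior data that matters,
// since for such a matrix M,
//
//   M.AddMat(1.0, X);    // M += X, must read M first
//
// computes the same result as
//
//   M.CopyFromMat(X);    // M = X, no read of M
//
// and copies can later be removed entirely by the variable-merging
// optimization.)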
@@ -515,13 +515,6 @@ void Optimize(const NnetOptimizeOptions &config, CheckComputation(nnet, *computation, true); } - if (config.optimize && - (config.remove_assignments || config.backprop_in_place || - config.propagate_in_place)) { - VariableMergingOptimization(config, nnet, computation); - if (GetVerboseLevel() >= 3) - CheckComputation(nnet, *computation, false); - } if (config.optimize && (config.snip_row_ops || config.optimize_row_ops)) { bool must_renumber = false; @@ -536,6 +529,21 @@ void Optimize(const NnetOptimizeOptions &config, } } + if (config.optimize && config.extend_matrices && + !config.optimize_looped_computation) { + ExtendMatrices(computation); + if (GetVerboseLevel() >= 3) + CheckComputation(nnet, *computation, false); + } + + + if (config.optimize && + (config.remove_assignments || config.backprop_in_place || + config.propagate_in_place)) { + VariableMergingOptimization(config, nnet, computation); + if (GetVerboseLevel() >= 3) + CheckComputation(nnet, *computation, false); + } if (config.optimize && config.initialize_undefined) { RemoveUnnecessaryZeroing(nnet, computation); @@ -543,7 +551,9 @@ void Optimize(const NnetOptimizeOptions &config, CheckComputation(nnet, *computation, false); } - if (config.optimize && config.move_sizing_commands) { + + if ((config.optimize && config.move_sizing_commands) || + config.optimize_looped_computation) { MoveSizingCommands(nnet, computation); if (GetVerboseLevel() >= 3) CheckComputation(nnet, *computation, false); @@ -552,7 +562,7 @@ void Optimize(const NnetOptimizeOptions &config, // the looped computation optimization has to go before // 'RemoveUnnecessaryAllocation()'. We don't gate this by 'config.optimize' // because it's necessary for looped computation to run. - if (config.optimize_looped_computation){ + if (config.optimize_looped_computation) { OptimizeLoopedComputation(nnet, computation); if (GetVerboseLevel() >= 3) CheckComputation(nnet, *computation, false); @@ -577,11 +587,21 @@ void Optimize(const NnetOptimizeOptions &config, if (config.optimize_looped_computation) FixGotoLabel(computation); + + if (config.memory_compression_level > 0 && + !config.optimize_looped_computation) { + OptimizeMemoryCompression(nnet, config.memory_compression_level, + computation); + if (GetVerboseLevel() >= 3) + CheckComputation(nnet, *computation, false); + } + if (GetVerboseLevel() >= 3) { CheckComputation(nnet, *computation, false); KALDI_LOG << "After optimization, max memory use (bytes) = " << GetMaxMemoryUse(*computation); } + } // ComputationRequests are distinguished by the names and indexes diff --git a/src/nnet3/nnet-optimize.h b/src/nnet3/nnet-optimize.h index 4ffa4de449e..31872e46b72 100644 --- a/src/nnet3/nnet-optimize.h +++ b/src/nnet3/nnet-optimize.h @@ -32,12 +32,14 @@ namespace nnet3 { // Options class for optimizing a NnetComputation. The main projected use for // this is in debugging the optimization code itself, so that if an error is // detected, we can work out which optimization was responsible for the error. +// See the Register() function below for option-specific documentation. struct NnetOptimizeOptions { bool optimize; // setting this false disallow all optimization. 
bool consolidate_model_update; bool propagate_in_place; bool backprop_in_place; bool optimize_row_ops; + bool extend_matrices; bool convert_addition; bool remove_assignments; bool allow_left_merge; @@ -49,6 +51,7 @@ struct NnetOptimizeOptions { int32 max_deriv_time; int32 max_deriv_time_relative; bool snip_row_ops; + int32 memory_compression_level; // optimize_looped_computation is a 'hidden config' not available from // the command line; it's set to true to enable the optimization for // looped computation that turns a linear computation into a loop. @@ -60,6 +63,7 @@ struct NnetOptimizeOptions { propagate_in_place(true), backprop_in_place(true), optimize_row_ops(true), + extend_matrices(true), convert_addition(true), remove_assignments(true), allow_left_merge(true), @@ -71,6 +75,7 @@ struct NnetOptimizeOptions { max_deriv_time(std::numeric_limits::max()), max_deriv_time_relative(std::numeric_limits::max()), snip_row_ops(true), + memory_compression_level(1), optimize_looped_computation(false) { } void Register(OptionsItf *opts) { @@ -84,6 +89,9 @@ struct NnetOptimizeOptions { "disable optimization that allows in-place propagation"); opts->Register("backprop-in-place", &backprop_in_place, "Set to false to " "disable optimization that allows in-place backprop"); + opts->Register("extend-matrices", &extend_matrices, "This optimization " + "can reduce memory requirements for TDNNs when applied " + "together with --convert-addition=true"); opts->Register("optimize-row-ops", &optimize_row_ops, "Set to false to " "disable certain optimizations that act on operations of " "type *Row*."); @@ -123,6 +131,14 @@ struct NnetOptimizeOptions { opts->Register("snip-row-ops", &snip_row_ops, "Set this to false to " "disable an optimization that reduces the size of certain " "per-row operations"); + opts->Register("memory-compression-level", &memory_compression_level, + "This is only relevant to training, not decoding. Set this " + "to 0,1,2; higher levels are more aggressive at reducing " + "memory by compressing quantities needed for backprop, " + "potentially at the expense of speed and the accuracy " + "of derivatives. 0 means no compression at all; 1 means " + "compression that shouldn't affect results at all."); + } void Read(std::istream &is, bool binary); void Write(std::ostream &os, bool binary) const; diff --git a/src/nnet3/nnet-parse.cc b/src/nnet3/nnet-parse.cc index 2c4da825013..bb3a209460a 100644 --- a/src/nnet3/nnet-parse.cc +++ b/src/nnet3/nnet-parse.cc @@ -481,7 +481,7 @@ static void PrintFloatSuccinctly(std::ostream &os, BaseFloat f) { // Returns a string that summarizes a vector fairly succintly, for // printing stats in info lines. 
-std::string SummarizeVector(const Vector<BaseFloat> &vec) { +std::string SummarizeVector(const VectorBase<float> &vec) { std::ostringstream os; if (vec.Dim() < 10) { os << "[ "; @@ -517,6 +517,16 @@ std::string SummarizeVector(const Vector<BaseFloat> &vec) { return os.str(); } +std::string SummarizeVector(const VectorBase<double> &vec) { + Vector<float> vec_copy(vec); + return SummarizeVector(vec_copy); +} + +std::string SummarizeVector(const CuVectorBase<BaseFloat> &cu_vec) { + Vector<BaseFloat> vec(cu_vec); + return SummarizeVector(vec); +} + void PrintParameterStats(std::ostringstream &os, const std::string &name, const CuVectorBase<BaseFloat> &params, @@ -537,7 +547,10 @@ void PrintParameterStats(std::ostringstream &os, void PrintParameterStats(std::ostringstream &os, const std::string &name, const CuMatrix<BaseFloat> &params, - bool include_mean) { + bool include_mean, + bool include_row_norms, + bool include_column_norms, + bool include_singular_values) { os << std::setprecision(4); os << ", " << name << '-'; int32 dim = params.NumRows() * params.NumCols(); @@ -551,8 +564,26 @@ void PrintParameterStats(std::ostringstream &os, os << "rms=" << rms; } os << std::setprecision(6); // restore the default precision. - if (GetVerboseLevel() >= 2) { - // At verbose level >= 2, print stats of the singular values of the matrix. + + if (include_row_norms) { + CuVector<BaseFloat> row_norms(params.NumRows()); + row_norms.AddDiagMat2(1.0, params, kNoTrans, 0.0); + row_norms.ApplyPow(0.5); + Vector<BaseFloat> row_norms_cpu; + row_norms.Swap(&row_norms_cpu); + os << ", " << name << "-row-norms=" + << SummarizeVector(row_norms_cpu); + } + if (include_column_norms) { + CuVector<BaseFloat> col_norms(params.NumCols()); + col_norms.AddDiagMat2(1.0, params, kTrans, 0.0); + col_norms.ApplyPow(0.5); + Vector<BaseFloat> col_norms_cpu; + col_norms.Swap(&col_norms_cpu); + os << ", " << name << "-col-norms=" + << SummarizeVector(col_norms_cpu); + } + if (include_singular_values) { Matrix<BaseFloat> params_cpu(params); Vector<BaseFloat> s(std::min(params.NumRows(), params.NumCols())); params_cpu.Svd(&s); diff --git a/src/nnet3/nnet-parse.h b/src/nnet3/nnet-parse.h index fef21301ff6..0b2e0041aaa 100644 --- a/src/nnet3/nnet-parse.h +++ b/src/nnet3/nnet-parse.h @@ -189,9 +189,16 @@ std::string ErrorContext(std::istream &is); std::string ErrorContext(const std::string &str); -// Returns a string that summarizes a vector fairly succintly, for -// printing stats in info lines. -std::string SummarizeVector(const Vector<BaseFloat> &vec); +/** Returns a string that summarizes a vector fairly succinctly, for + printing stats in info lines. For example: + "[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.001,0.003,0.003,0.004 \ + 0.005,0.01,0.07,0.11,0.14 0.18,0.24,0.29,0.39), mean=0.0745, stddev=0.0611]" +*/ +std::string SummarizeVector(const VectorBase<float> &vec); + +std::string SummarizeVector(const VectorBase<double> &vec); + +std::string SummarizeVector(const CuVectorBase<BaseFloat> &vec); /** Print to 'os' some information about the mean and standard deviation of some parameters, used in Info() functions in nnet-simple-component.cc. @@ -213,13 +220,25 @@ void PrintParameterStats(std::ostringstream &os, PrintParameterStats(os, "linear-params", linear_params_); would print to 'os' something like the string ", linear-params-rms=0.239". - If you set include_mean to true, it will print something like + If you set 'include_mean' to true, it will print something like ", linear-params-{mean-stddev}=0.103,0.183".
+ If you set 'include_row_norms' to true, it will print something + like + ", linear-params-row-norms=[percentiles(0,1........, stddev=0.0508]" + If you set 'include_column_norms' to true, it will print something + like + ", linear-params-col-norms=[percentiles(0,1........, stddev=0.0508]" + If you set 'include_singular_values' to true, it will print something + like + ", linear-params-singular-values=[percentiles(0,1........, stddev=0.0508]" */ void PrintParameterStats(std::ostringstream &os, const std::string &name, const CuMatrix ¶ms, - bool include_mean = false); + bool include_mean = false, + bool include_row_norms = false, + bool include_column_norms = false, + bool include_singular_values = false); } // namespace nnet3 diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc index c6d2c1f7952..b3cf89ae6b4 100644 --- a/src/nnet3/nnet-simple-component.cc +++ b/src/nnet3/nnet-simple-component.cc @@ -313,181 +313,6 @@ void ElementwiseProductComponent::Write(std::ostream &os, bool binary) const { WriteToken(os, binary, ""); } -const BaseFloat NormalizeComponent::kSquaredNormFloor = - pow(2.0, NormalizeComponent::kExpSquaredNormFloor); - -NormalizeComponent::NormalizeComponent(const NormalizeComponent &other): - input_dim_(other.input_dim_), block_dim_(other.block_dim_), - target_rms_(other.target_rms_), - add_log_stddev_(other.add_log_stddev_) { } - -void NormalizeComponent::InitFromConfig(ConfigLine *cfl) { - input_dim_ = 0; - add_log_stddev_ = false; - target_rms_ = 1.0; - bool ok = cfl->GetValue("dim", &input_dim_) || - cfl->GetValue("input-dim", &input_dim_); - block_dim_ = input_dim_; - cfl->GetValue("block-dim", &block_dim_); - cfl->GetValue("target-rms", &target_rms_); - cfl->GetValue("add-log-stddev", &add_log_stddev_); - if (!ok || cfl->HasUnusedValues() || input_dim_ <= 0 || target_rms_ <= 0.0 || - block_dim_ <= 0 || input_dim_ % block_dim_ != 0) - KALDI_ERR << "Invalid initializer for layer of type " - << Type() << ": \"" << cfl->WholeLine() << "\""; -} - -void NormalizeComponent::Read(std::istream &is, bool binary) { - std::string token; - ReadToken(is, binary, &token); - if (token == "") { - ReadToken(is, binary, &token); - } - KALDI_ASSERT(token == "" || token == ""); - ReadBasicType(is, binary, &input_dim_); // Read dimension. - ReadToken(is, binary, &token); - if (token == "") { - ReadBasicType(is, binary, &block_dim_); - ReadToken(is, binary, &token); - } else { - block_dim_ = input_dim_; - } - // read target_rms_ if it is available. - if (token == "") { - ReadBasicType(is, binary, &target_rms_); - ReadToken(is, binary, &token); - } - // Read add_log_stddev_ token, if it is available. - if (token == "") { - ReadBasicType(is, binary, &add_log_stddev_); - ReadToken(is, binary, &token); - } else { - add_log_stddev_ = false; - } - if (token == "") { - // back-compatibility code. 
- CuVector temp; - temp.Read(is, binary); - ExpectToken(is, binary, ""); - temp.Read(is, binary); - ExpectToken(is, binary, ""); - double count; - ReadBasicType(is, binary, &count); - ReadToken(is, binary, &token); - } - KALDI_ASSERT(token == ""); -} - -void NormalizeComponent::Write(std::ostream &os, bool binary) const { - WriteToken(os, binary, ""); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, input_dim_); - if (block_dim_ != input_dim_) { - WriteToken(os, binary, ""); - WriteBasicType(os, binary, block_dim_); - } - WriteToken(os, binary, ""); - WriteBasicType(os, binary, target_rms_); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, add_log_stddev_); - WriteToken(os, binary, ""); -} - -std::string NormalizeComponent::Info() const { - std::ostringstream stream; - stream << Type() << ", input-dim=" << InputDim() - << ", output-dim=" << OutputDim() << ", target-rms=" << target_rms_ - << ", add-log-stddev=" << std::boolalpha << add_log_stddev_; - if (block_dim_ != input_dim_) - stream << ", block-dim=" << block_dim_; - return stream.str(); -} - -// The output y_i = scale * x_i, -// and we want to RMS value of the y_i to equal target_rms, -// so y^t y = D * target_rms^2 (if y is one row of the input). -// we need to have scale = 1.0 / sqrt(x^t x / (D * target_rms^2)). -// there is also flooring involved, to avoid division-by-zero -// problems. It's important for the backprop, that the floor's -// square root is exactly representable as float. -// If add_log_stddev_ is true, log(max(epsi, sqrt(x^t x / D))) -// is an extra dimension of the output. -void* NormalizeComponent::Propagate(const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in, - CuMatrixBase *out) const { - KALDI_ASSERT(in.NumCols() == InputDim() && out->NumCols() == OutputDim() && - in.NumRows() == out->NumRows()); - if (block_dim_ != input_dim_) { - int32 num_blocks = input_dim_ / block_dim_, - new_num_rows = in.NumRows() * num_blocks, - output_block_dim = block_dim_ + (add_log_stddev_ ? 1 : 0); - KALDI_ASSERT(in.Stride() == in.NumCols() && out->Stride() == out->NumCols()); - CuSubMatrix in_reshaped(in.Data(), new_num_rows, - block_dim_, block_dim_), - out_reshaped(out->Data(), new_num_rows, - output_block_dim, output_block_dim); - cu::NormalizePerRow(in_reshaped, target_rms_, add_log_stddev_, - &out_reshaped); - } else { - cu::NormalizePerRow(in, target_rms_, add_log_stddev_, out); - } - return NULL; -} - -/* - A note on the derivative of NormalizeComponent... - let both row_in and row_out be vectors of dimension D. - Let p = row_in^T row_in / (D * target_rms^2), and let - f = 1.0 / sqrt(max(kSquaredNormFloor, p)), and we compute row_out as: - row_out = f row_in. - Suppose we have a quantity deriv_out which is the derivative - of the objective function w.r.t. row_out. We want to compute - deriv_in which is the derivative of the objective function w.r.t. - row_in. Let the objective function be F. One term is obvious: we have - deriv_in = f deriv_out + .... - next we have to take into account the derivative that gets back-propagated - through f. Obviously, dF/df = deriv_out^T row_in. - And df/dp = (p <= kSquaredNormFloor ? 0.0 : -0.5 p^{-1.5}) = (f == 1.0 / sqrt(kSquaredNormFloor) ? 0.0 : -0.5 f^3), - and dp/d(row_in) = 2/(D * target_rms^2) row_in. [it's vector_valued]. - So this term in dF/d(row_in) equals: - dF/df df/dp dp/d(row_in) = 2/(D * target_rms^2) (f == 1.0 / sqrt(kSquaredNormFloor) ? 0.0 : -0.5 f^3) (deriv_out^T row_in) row_in - So - deriv_in = f deriv_out + (f == 1.0 ? 
0.0 : -f^3 / (D * target_rms^2) ) (deriv_out^T row_in) row_in - - if add_log_stddev_ true, the deriv_in has another term as - dF/dx_i = dF/df . df/dx_i => df/dx_i = x_i/(x^T x) -*/ -void NormalizeComponent::Backprop(const std::string &debug_info, - const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in_value, - const CuMatrixBase &, // out_value - const CuMatrixBase &out_deriv, - void *memo, - Component *to_update, - CuMatrixBase *in_deriv) const { - if (!in_deriv) - return; - if (block_dim_ != input_dim_) { - int32 num_blocks = input_dim_ / block_dim_, - new_num_rows = in_value.NumRows() * num_blocks, - output_block_dim = block_dim_ + (add_log_stddev_ ? 1 : 0); - KALDI_ASSERT(in_value.Stride() == in_value.NumCols() && - out_deriv.Stride() == out_deriv.NumCols() && - in_deriv->Stride() == in_deriv->NumCols()); - CuSubMatrix in_value_reshaped(in_value.Data(), new_num_rows, - block_dim_, block_dim_), - out_deriv_reshaped(out_deriv.Data(), new_num_rows, - output_block_dim, output_block_dim), - in_deriv_reshaped(in_deriv->Data(), new_num_rows, - block_dim_, block_dim_); - cu::DiffNormalizePerRow(in_value_reshaped, out_deriv_reshaped, target_rms_, - add_log_stddev_, &in_deriv_reshaped); - } else { - cu::DiffNormalizePerRow(in_value, out_deriv, target_rms_, add_log_stddev_, - in_deriv); - } -} - void* SigmoidComponent::Propagate(const ComponentPrecomputedIndexes *indexes, const CuMatrixBase &in, CuMatrixBase *out) const { @@ -506,8 +331,10 @@ void SigmoidComponent::Backprop(const std::string &debug_info, if (in_deriv != NULL) { in_deriv->DiffSigmoid(out_value, out_deriv); SigmoidComponent *to_update = dynamic_cast(to_update_in); - if (to_update != NULL) + if (to_update != NULL) { RepairGradients(out_value, in_deriv, to_update); + to_update->StoreBackpropStats(out_deriv); + } } } @@ -1015,8 +842,10 @@ void TanhComponent::Backprop(const std::string &debug_info, if (in_deriv != NULL) { in_deriv->DiffTanh(out_value, out_deriv); TanhComponent *to_update = dynamic_cast(to_update_in); - if (to_update != NULL) + if (to_update != NULL) { RepairGradients(out_value, in_deriv, to_update); + to_update->StoreBackpropStats(out_deriv); + } } } @@ -1065,8 +894,10 @@ void RectifiedLinearComponent::Backprop( in_deriv->MulElements(out_deriv); RectifiedLinearComponent *to_update = dynamic_cast(to_update_in); - if (to_update != NULL) + if (to_update != NULL) { RepairGradients(in_deriv, to_update); + to_update->StoreBackpropStats(out_deriv); + } } } @@ -1200,13 +1031,15 @@ void AffineComponent::Add(BaseFloat alpha, const Component &other_in) { AffineComponent::AffineComponent(const AffineComponent &component): UpdatableComponent(component), linear_params_(component.linear_params_), - bias_params_(component.bias_params_) { } + bias_params_(component.bias_params_), + orthonormal_constraint_(component.orthonormal_constraint_) { } AffineComponent::AffineComponent(const CuMatrixBase &linear_params, const CuVectorBase &bias_params, BaseFloat learning_rate): linear_params_(linear_params), - bias_params_(bias_params) { + bias_params_(bias_params), + orthonormal_constraint_(0.0) { SetUnderlyingLearningRate(learning_rate); KALDI_ASSERT(linear_params.NumRows() == bias_params.Dim()&& bias_params.Dim() != 0); @@ -1232,7 +1065,13 @@ void AffineComponent::PerturbParams(BaseFloat stddev) { std::string AffineComponent::Info() const { std::ostringstream stream; stream << UpdatableComponent::Info(); - PrintParameterStats(stream, "linear-params", linear_params_); + if (orthonormal_constraint_ != 0.0) + stream << ", 
orthonormal-constraint=" << orthonormal_constraint_; + PrintParameterStats(stream, "linear-params", linear_params_, + false, // include_mean + true, // include_row_norms + true, // include_column_norms + GetVerboseLevel() >= 2); // include_singular_values PrintParameterStats(stream, "bias", bias_params_, true); return stream.str(); } @@ -1294,6 +1133,8 @@ void AffineComponent::InitFromConfig(ConfigLine *cfl) { Init(input_dim, output_dim, param_stddev, bias_stddev); } + cfl->GetValue("orthonormal-constraint", &orthonormal_constraint_); + if (cfl->HasUnusedValues()) KALDI_ERR << "Could not process these elements in initializer: " << cfl->UnusedValues(); @@ -1362,6 +1203,12 @@ void AffineComponent::Read(std::istream &is, bool binary) { ExpectToken(is, binary, ""); ReadBasicType(is, binary, &is_gradient_); } + if (PeekToken(is, binary) == 'O') { + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &orthonormal_constraint_); + } else { + orthonormal_constraint_ = 0.0; + } ExpectToken(is, binary, ""); } @@ -1371,6 +1218,10 @@ void AffineComponent::Write(std::ostream &os, bool binary) const { linear_params_.Write(os, binary); WriteToken(os, binary, ""); bias_params_.Write(os, binary); + if (orthonormal_constraint_ != 0.0) { + WriteToken(os, binary, ""); + WriteBasicType(os, binary, orthonormal_constraint_); + } WriteToken(os, binary, ""); } @@ -1688,7 +1539,7 @@ void NaturalGradientRepeatedAffineComponent::Update( try { // Only apply the preconditioning/natural-gradient if we're not computing // the exact gradient. - preconditioner_in_.PreconditionDirections(&deriv, NULL, &scale); + preconditioner_in_.PreconditionDirections(&deriv, &scale); } catch (...) { int32 num_bad_rows = 0; for (int32 i = 0; i < out_deriv.NumRows(); i++) { @@ -2103,12 +1954,6 @@ void PerElementScaleComponent::Backprop( PerElementScaleComponent *to_update = dynamic_cast(to_update_in); - if (in_deriv) { - // Propagate the derivative back to the input. - in_deriv->CopyFromMat(out_deriv); - in_deriv->MulColsVec(scales_); - } - if (to_update != NULL) { // Next update the model (must do this 2nd so the derivatives we propagate // are accurate, in case this == to_update_in.) @@ -2117,6 +1962,13 @@ void PerElementScaleComponent::Backprop( else // the call below is to a virtual function that may be re-implemented to_update->Update(debug_info, in_value, out_deriv); // by child classes. } + + if (in_deriv) { + // Propagate the derivative back to the input. 
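+    // (The Data() comparison below guards the in-place case: when the
+    // optimizer has arranged for in_deriv and out_deriv to share storage,
+    // CopyFromMat() would be a self-copy, so it is skipped and only the
+    // MulColsVec() scaling is applied.)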
+ if (in_deriv->Data() != out_deriv.Data()) + in_deriv->CopyFromMat(out_deriv); + in_deriv->MulColsVec(scales_); + } } void PerElementScaleComponent::Read(std::istream &is, bool binary) { @@ -2303,7 +2155,7 @@ void PerElementOffsetComponent::Backprop( // this scenario) CuMatrix out_deriv_copy(out_deriv_reshaped); BaseFloat scale = 1.0; - to_update->preconditioner_.PreconditionDirections(&out_deriv_copy, NULL, + to_update->preconditioner_.PreconditionDirections(&out_deriv_copy, &scale); to_update->offsets_.AddRowSumMat(scale * to_update->learning_rate_, out_deriv_copy); @@ -2588,7 +2440,7 @@ void ScaleAndOffsetComponent::BackpropInternal( BaseFloat scale = 1.0; CuMatrix out_deriv_copy(out_deriv); to_update->offset_preconditioner_.PreconditionDirections( - &out_deriv_copy, NULL, &scale); + &out_deriv_copy, &scale); to_update->offsets_.AddRowSumMat(scale * to_update->learning_rate_, out_deriv_copy); } @@ -2611,7 +2463,7 @@ void ScaleAndOffsetComponent::BackpropInternal( BaseFloat scale = 1.0; if (to_update->use_natural_gradient_ && !to_update->is_gradient_) { to_update->scale_preconditioner_.PreconditionDirections( - &in_value_reconstructed, NULL, &scale); + &in_value_reconstructed, &scale); } to_update->scales_.AddRowSumMat(scale * to_update->learning_rate_, in_value_reconstructed); @@ -2677,7 +2529,7 @@ void ConstantFunctionComponent::Backprop( CuMatrix out_deriv_copy(out_deriv); BaseFloat scale = 1.0; to_update->preconditioner_.PreconditionDirections(&out_deriv_copy, - NULL, &scale); + &scale); to_update->output_.AddRowSumMat(scale * to_update->learning_rate_, out_deriv_copy); } else { @@ -2820,16 +2672,35 @@ void NaturalGradientAffineComponent::Read(std::istream &is, bool binary) { linear_params_.Read(is, binary); ExpectToken(is, binary, ""); bias_params_.Read(is, binary); + + BaseFloat num_samples_history, alpha; + int32 rank_in, rank_out, update_period; + ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &rank_in_); + ReadBasicType(is, binary, &rank_in); ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &rank_out_); + ReadBasicType(is, binary, &rank_out); + if (PeekToken(is, binary) == 'O') { + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &orthonormal_constraint_); + } else { + orthonormal_constraint_ = 0.0; + } ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &update_period_); + ReadBasicType(is, binary, &update_period); ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &num_samples_history_); + ReadBasicType(is, binary, &num_samples_history); ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &alpha_); + ReadBasicType(is, binary, &alpha); + + preconditioner_in_.SetNumSamplesHistory(num_samples_history); + preconditioner_out_.SetNumSamplesHistory(num_samples_history); + preconditioner_in_.SetAlpha(alpha); + preconditioner_out_.SetAlpha(alpha); + preconditioner_in_.SetRank(rank_in); + preconditioner_out_.SetRank(rank_out); + preconditioner_out_.SetUpdatePeriod(update_period); + if (PeekToken(is, binary) == 'M') { // MaxChangePerSample, long ago removed; back compatibility. 
ExpectToken(is, binary, ""); @@ -2858,7 +2729,6 @@ void NaturalGradientAffineComponent::Read(std::istream &is, bool binary) { if (token.find("NaturalGradientAffineComponent>") == std::string::npos) KALDI_ERR << "Expected or " << ", got " << token; - SetNaturalGradientConfigs(); } @@ -2868,30 +2738,21 @@ NaturalGradientAffineComponent::NaturalGradientAffineComponent( AffineComponent(linear_params, bias_params, 0.001) { KALDI_ASSERT(bias_params.Dim() == linear_params.NumRows() && bias_params.Dim() != 0); - num_samples_history_ = 2000.0; - alpha_ = 4.0; - rank_in_ = 20; - rank_out_ = 80; - update_period_ = 4; - SetNaturalGradientConfigs(); + + // set some default natural gradient configs. + preconditioner_in_.SetRank(20); + preconditioner_out_.SetRank(80); + preconditioner_in_.SetUpdatePeriod(4); + preconditioner_out_.SetUpdatePeriod(4); } void NaturalGradientAffineComponent::InitFromConfig(ConfigLine *cfl) { bool ok = true; std::string matrix_filename; - num_samples_history_ = 2000.0; - alpha_ = 4.0; - rank_in_ = 20; - rank_out_ = 80; - update_period_ = 4; + is_gradient_ = false; // not configurable; there's no reason you'd want this InitLearningRatesFromConfig(cfl); - cfl->GetValue("num-samples-history", &num_samples_history_); - cfl->GetValue("alpha", &alpha_); - cfl->GetValue("rank-in", &rank_in_); - cfl->GetValue("rank-out", &rank_out_); - cfl->GetValue("update-period", &update_period_); if (cfl->GetValue("matrix", &matrix_filename)) { CuMatrix mat; @@ -2930,23 +2791,34 @@ void NaturalGradientAffineComponent::InitFromConfig(ConfigLine *cfl) { bias_params_.Scale(bias_stddev); bias_params_.Add(bias_mean); } + + orthonormal_constraint_ = 0.0; + cfl->GetValue("orthonormal-constraint", &orthonormal_constraint_); + + // Set natural-gradient configs. + BaseFloat num_samples_history = 2000.0, + alpha = 4.0; + int32 rank_in = 20, rank_out = 80, + update_period = 4; + cfl->GetValue("num-samples-history", &num_samples_history); + cfl->GetValue("alpha", &alpha); + cfl->GetValue("rank-in", &rank_in); + cfl->GetValue("rank-out", &rank_out); + cfl->GetValue("update-period", &update_period); + + preconditioner_in_.SetNumSamplesHistory(num_samples_history); + preconditioner_out_.SetNumSamplesHistory(num_samples_history); + preconditioner_in_.SetAlpha(alpha); + preconditioner_out_.SetAlpha(alpha); + preconditioner_in_.SetRank(rank_in); + preconditioner_out_.SetRank(rank_out); + preconditioner_out_.SetUpdatePeriod(update_period); + if (cfl->HasUnusedValues()) KALDI_ERR << "Could not process these elements in initializer: " << cfl->UnusedValues(); if (!ok) KALDI_ERR << "Bad initializer " << cfl->WholeLine(); - SetNaturalGradientConfigs(); -} - -void NaturalGradientAffineComponent::SetNaturalGradientConfigs() { - preconditioner_in_.SetRank(rank_in_); - preconditioner_in_.SetNumSamplesHistory(num_samples_history_); - preconditioner_in_.SetAlpha(alpha_); - preconditioner_in_.SetUpdatePeriod(update_period_); - preconditioner_out_.SetRank(rank_out_); - preconditioner_out_.SetNumSamplesHistory(num_samples_history_); - preconditioner_out_.SetAlpha(alpha_); - preconditioner_out_.SetUpdatePeriod(update_period_); } void NaturalGradientAffineComponent::Write(std::ostream &os, @@ -2957,28 +2829,30 @@ void NaturalGradientAffineComponent::Write(std::ostream &os, WriteToken(os, binary, ""); bias_params_.Write(os, binary); WriteToken(os, binary, ""); - WriteBasicType(os, binary, rank_in_); + WriteBasicType(os, binary, preconditioner_in_.GetRank()); WriteToken(os, binary, ""); - WriteBasicType(os, binary, rank_out_); + 
WriteBasicType(os, binary, preconditioner_out_.GetRank()); + if (orthonormal_constraint_ != 0.0) { + WriteToken(os, binary, ""); + WriteBasicType(os, binary, orthonormal_constraint_); + } WriteToken(os, binary, ""); - WriteBasicType(os, binary, update_period_); + WriteBasicType(os, binary, preconditioner_in_.GetUpdatePeriod()); WriteToken(os, binary, ""); - WriteBasicType(os, binary, num_samples_history_); + WriteBasicType(os, binary, preconditioner_in_.GetNumSamplesHistory()); WriteToken(os, binary, ""); - WriteBasicType(os, binary, alpha_); + WriteBasicType(os, binary, preconditioner_in_.GetAlpha()); WriteToken(os, binary, ""); } std::string NaturalGradientAffineComponent::Info() const { std::ostringstream stream; - stream << UpdatableComponent::Info(); - PrintParameterStats(stream, "linear-params", linear_params_); - PrintParameterStats(stream, "bias", bias_params_, true); - stream << ", rank-in=" << rank_in_ - << ", rank-out=" << rank_out_ - << ", num-samples-history=" << num_samples_history_ - << ", update-period=" << update_period_ - << ", alpha=" << alpha_; + stream << AffineComponent::Info(); + stream << ", rank-in=" << preconditioner_in_.GetRank() + << ", rank-out=" << preconditioner_out_.GetRank() + << ", num-samples-history=" << preconditioner_in_.GetNumSamplesHistory() + << ", update-period=" << preconditioner_in_.GetUpdatePeriod() + << ", alpha=" << preconditioner_in_.GetAlpha(); return stream.str(); } @@ -2989,15 +2863,8 @@ Component* NaturalGradientAffineComponent::Copy() const { NaturalGradientAffineComponent::NaturalGradientAffineComponent( const NaturalGradientAffineComponent &other): AffineComponent(other), - rank_in_(other.rank_in_), - rank_out_(other.rank_out_), - update_period_(other.update_period_), - num_samples_history_(other.num_samples_history_), - alpha_(other.alpha_), preconditioner_in_(other.preconditioner_in_), - preconditioner_out_(other.preconditioner_out_) { - SetNaturalGradientConfigs(); -} + preconditioner_out_(other.preconditioner_out_) { } void NaturalGradientAffineComponent::Update( const std::string &debug_info, @@ -3020,8 +2887,8 @@ void NaturalGradientAffineComponent::Update( // than having the matrices scaled inside the preconditioning code). 
BaseFloat in_scale, out_scale; - preconditioner_in_.PreconditionDirections(&in_value_temp, NULL, &in_scale); - preconditioner_out_.PreconditionDirections(&out_deriv_temp, NULL, &out_scale); + preconditioner_in_.PreconditionDirections(&in_value_temp, &in_scale); + preconditioner_out_.PreconditionDirections(&out_deriv_temp, &out_scale); // "scale" is a scaling factor coming from the PreconditionDirections calls // (it's faster to have them output a scaling factor than to have them scale @@ -3075,6 +2942,12 @@ void LinearComponent::Read(std::istream &is, bool binary) { KALDI_ASSERT(token == ""); ExpectToken(is, binary, ""); params_.Read(is, binary); + if (PeekToken(is, binary) == 'O') { + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &orthonormal_constraint_); + } else { + orthonormal_constraint_ = 0.0; + } ExpectToken(is, binary, ""); ReadBasicType(is, binary, &use_natural_gradient_); @@ -3137,11 +3010,14 @@ void LinearComponent::InitFromConfig(ConfigLine *cfl) { BaseFloat alpha = 4.0, num_samples_history = 2000.0; + use_natural_gradient_ = true; + cfl->GetValue("num-samples-history", &num_samples_history); cfl->GetValue("alpha", &alpha); cfl->GetValue("rank-in", &rank_in); cfl->GetValue("rank-out", &rank_out); cfl->GetValue("update-period", &update_period); + cfl->GetValue("use-natural-gradient", &use_natural_gradient_); preconditioner_in_.SetAlpha(alpha); preconditioner_out_.SetAlpha(alpha); @@ -3152,6 +3028,9 @@ void LinearComponent::InitFromConfig(ConfigLine *cfl) { preconditioner_in_.SetUpdatePeriod(update_period); preconditioner_out_.SetUpdatePeriod(update_period); + orthonormal_constraint_ = 0.0; + cfl->GetValue("orthonormal-constraint", &orthonormal_constraint_); + if (cfl->HasUnusedValues()) KALDI_ERR << "Could not process these elements in initializer: " << cfl->UnusedValues(); @@ -3163,6 +3042,10 @@ void LinearComponent::Write(std::ostream &os, WriteUpdatableCommon(os, binary); // Write the opening tag and learning rate WriteToken(os, binary, ""); params_.Write(os, binary); + if (orthonormal_constraint_ != 0.0) { + WriteToken(os, binary, ""); + WriteBasicType(os, binary, orthonormal_constraint_); + } WriteToken(os, binary, ""); WriteBasicType(os, binary, use_natural_gradient_); @@ -3186,11 +3069,17 @@ void LinearComponent::Write(std::ostream &os, std::string LinearComponent::Info() const { std::ostringstream stream; stream << UpdatableComponent::Info(); - PrintParameterStats(stream, "params", params_); + PrintParameterStats(stream, "params", params_, + false, // include_mean + true, // include_row_norms + true, // include_column_norms + GetVerboseLevel() >= 2); // include_singular_values + if (orthonormal_constraint_ != 0.0) + stream << ", orthonormal-constraint=" << orthonormal_constraint_; stream << ", use-natural-gradient=" << (use_natural_gradient_ ? "true" : "false") << ", rank-in=" << preconditioner_in_.GetRank() - << ", rank-out=" << preconditioner_in_.GetRank() + << ", rank-out=" << preconditioner_out_.GetRank() << ", num-samples-history=" << preconditioner_in_.GetNumSamplesHistory() << ", update-period=" << preconditioner_in_.GetUpdatePeriod() @@ -3228,9 +3117,9 @@ void LinearComponent::Backprop(const std::string &debug_info, // than having the matrices scaled inside the preconditioning code). 
BaseFloat in_scale, out_scale; to_update->preconditioner_in_.PreconditionDirections(&in_value_temp, - NULL, &in_scale); + &in_scale); to_update->preconditioner_out_.PreconditionDirections(&out_deriv_temp, - NULL, &out_scale); + &out_scale); BaseFloat local_lrate = in_scale * out_scale * to_update->learning_rate_; to_update->params_.AddMatMat(local_lrate, out_deriv_temp, kTrans, @@ -3252,12 +3141,14 @@ LinearComponent::LinearComponent( const LinearComponent &other): UpdatableComponent(other), params_(other.params_), + orthonormal_constraint_(other.orthonormal_constraint_), use_natural_gradient_(other.use_natural_gradient_), preconditioner_in_(other.preconditioner_in_), preconditioner_out_(other.preconditioner_out_) { } LinearComponent::LinearComponent(const CuMatrix ¶ms): params_(params), + orthonormal_constraint_(0.0), use_natural_gradient_(true) { // Set defaults for natural gradient. preconditioner_in_.SetRank(40); @@ -3549,6 +3440,13 @@ void SoftmaxComponent::Backprop(const std::string &debug_info, void *memo, Component *to_update_in, CuMatrixBase *in_deriv) const { + + if (to_update_in) { + SoftmaxComponent *to_update = + dynamic_cast(to_update_in); + to_update->StoreBackpropStats(out_deriv); + } + if (in_deriv == NULL) return; /* @@ -3588,8 +3486,13 @@ void LogSoftmaxComponent::Backprop(const std::string &debug_info, const CuMatrixBase &out_value, const CuMatrixBase &out_deriv, void *memo, - Component *, // to_update + Component *to_update_in, CuMatrixBase *in_deriv) const { + if (to_update_in) { + LogSoftmaxComponent *to_update = + dynamic_cast(to_update_in); + to_update->StoreBackpropStats(out_deriv); + } if (in_deriv == NULL) return; in_deriv->DiffLogSoftmaxPerRow(out_value, out_deriv); @@ -3902,7 +3805,7 @@ void NaturalGradientPerElementScaleComponent::Update( // scales_.AddRowSumMat(learning_rate_, derivs_per_frame). BaseFloat scale; - preconditioner_.PreconditionDirections(&derivs_per_frame, NULL, &scale); + preconditioner_.PreconditionDirections(&derivs_per_frame, &scale); CuVector delta_scales(scales_.Dim()); delta_scales.AddRowSumMat(scale * learning_rate_, derivs_per_frame); @@ -5775,7 +5678,7 @@ void LstmNonlinearityComponent::Backprop( BaseFloat scale = 1.0; if (!to_update->is_gradient_) { to_update->preconditioner_.PreconditionDirections( - ¶ms_deriv, NULL, &scale); + ¶ms_deriv, &scale); } to_update->params_.AddMat(to_update->learning_rate_ * scale, params_deriv); @@ -5883,489 +5786,6 @@ void LstmNonlinearityComponent::InitFromConfig(ConfigLine *cfl) { } } - - -void BatchNormComponent::ComputeDerived() { - if (!test_mode_) { - offset_.Resize(0); - scale_.Resize(0); - return; - } - - if (count_ == 0.0) { - KALDI_WARN << "Test-mode is set but there is no data count. " - "Creating random counts. This only makes sense " - "in unit-tests (or compute_prob_*.0.log). If you see this " - "elsewhere, something is very wrong."; - count_ = 1.0; - stats_sum_.SetRandn(); - stats_sumsq_.SetRandn(); - stats_sumsq_.AddVecVec(1.0, stats_sum_, stats_sum_, 1.0); - } - - offset_.Resize(block_dim_); - scale_.Resize(block_dim_); - offset_.CopyFromVec(stats_sum_); - offset_.Scale(-1.0 / count_); - // now offset_ is -mean. - scale_.CopyFromVec(stats_sumsq_); - scale_.Scale(1.0 / count_); - scale_.AddVecVec(-1.0, offset_, offset_, 1.0); - // now scale_ is variance. - // Mathematically the ApplyFloor statement should be a no-op; this is in case - // of numerical roundoff. - scale_.ApplyFloor(0.0); - scale_.Add(epsilon_); - scale_.ApplyPow(-0.5); - // now scale_ = min(variance, epsilon)^{-0.5}. 
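-  // (Numeric check of the derived quantities, covering the two remaining
-  // statements below as well: with count_ = 2, stats_sum_ = [2.0],
-  // stats_sumsq_ = [4.0], epsilon_ = 1e-3 and target_rms_ = 1.0, we get
-  // mean = 1.0, var = 4.0/2 - 1.0^2 = 1.0, scale_ = (1.0 + 1e-3)^-0.5
-  // ~= 0.9995 and offset_ = -mean * scale_ ~= -0.9995, so test mode
-  // computes y = 0.9995 * x - 0.9995.)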
- // next, multiply by the target RMS (normally 1.0). - scale_.Scale(target_rms_); - offset_.MulElements(scale_); - // now offset_ is -(scale*mean). -} - -void BatchNormComponent::SetTestMode(bool test_mode) { - test_mode_ = test_mode; - ComputeDerived(); -} - -void BatchNormComponent::Check() const { - KALDI_ASSERT(dim_ > 0 && block_dim_ > 0 && dim_ % block_dim_ == 0 && - epsilon_ > 0.0 && target_rms_ > 0.0); -} - -BatchNormComponent::BatchNormComponent(const BatchNormComponent &other): - dim_(other.dim_), block_dim_(other.block_dim_), epsilon_(other.epsilon_), - target_rms_(other.target_rms_), test_mode_(other.test_mode_), - count_(other.count_), stats_sum_(other.stats_sum_), - stats_sumsq_(other.stats_sumsq_) { - ComputeDerived(); - Check(); -} - - -std::string BatchNormComponent::Info() const { - std::ostringstream stream; - stream << Type() << ", dim=" << dim_ << ", block-dim=" << block_dim_ - << ", epsilon=" << epsilon_ << ", target-rms=" << target_rms_ - << ", count=" << count_ - << ", test-mode=" << (test_mode_ ? "true" : "false"); - if (count_ > 0) { - Vector mean(stats_sum_), var(stats_sumsq_); - mean.Scale(1.0 / count_); - var.Scale(1.0 / count_); - // subtract mean^2 from var. - var.AddVecVec(-1.0, mean, mean, 1.0); - var.ApplyFloor(0.0); - var.ApplyPow(0.5); // make it the stddev. - stream << ", data-mean=" << SummarizeVector(mean) - << ", data-stddev=" << SummarizeVector(var); - } - return stream.str(); -} - -void BatchNormComponent::InitFromConfig(ConfigLine *cfl) { - dim_ = -1; - block_dim_ = -1; - epsilon_ = 1.0e-03; - target_rms_ = 1.0; - test_mode_ = false; - bool ok = cfl->GetValue("dim", &dim_); - cfl->GetValue("block-dim", &block_dim_); - cfl->GetValue("epsilon", &epsilon_); - cfl->GetValue("target-rms", &target_rms_); - cfl->GetValue("test-mode", &test_mode_); - if (!ok || dim_ <= 0) { - KALDI_ERR << "BatchNormComponent must have 'dim' specified, and > 0"; - } - if (block_dim_ == -1) - block_dim_ = dim_; - if (!(block_dim_ > 0 && dim_ % block_dim_ == 0 && - epsilon_ > 0 && target_rms_ > 0)) - KALDI_ERR << "Invalid configuration in BatchNormComponent."; - if (cfl->HasUnusedValues()) - KALDI_ERR << "Could not process these elements in initializer: " - << cfl->UnusedValues(); - count_ = 0; - stats_sum_.Resize(block_dim_); - stats_sumsq_.Resize(block_dim_); - if (test_mode_) { - ComputeDerived(); - } -} - - - -/* - BATCH_NORM_MATH - - This comment describes the equations involved in batch normalization, and - derives the forward and back-propagation. - - This is all dimension-by-dimension, so we just imagine the inputs - are scalars x(i), for i=0 .. n-1. - - FORWARD PASS: - - Define xsum = sum_i x(i) - x2sum = sum_i x(i)^2 - mean = xsum / n - var = x2sum / n - (mean*mean) - scale = (var + epsilon)^{-0.5} - offset = -mean * scale - - y(i) = scale * x(i) + offset - - Most of the rest of this comment derives how to compute the derivatives. If - you just want the formulas, please skip to the string 'BACKWARD PASS' below. - - We'll use a notation where an apostrophe on something means (the derivative of - the objective function w.r.t. that thing), so y'(i) is df/dy(i), and so on. - We are given y'(i). 
Propagating the derivatives backward: - offset' = sum_i y'(i) - scale' = (sum_i y'(i) * x(i)) - offset' * mean - var' = scale' * -0.5 * (var + epsilon)^{-1.5} - = -0.5 * scale' * scale^3 - mean' = -offset' * scale - 2 * mean * var' - xsum' = mean' / n - x2sum' = var' / n - - So the derivatives propagated back to the original data are: - x'(i) = y'(i) * scale + xsum' + x(i) * x2sum' - - The above is quite complicated to compute, but we can use some invariances - to work out a simpler way to compute the derivatives. - - Firstly, note that x'(i) is of the form: - - x'(i) = y'(i) * scale + [affine function of x(i)]. - - [it's a 1-d affine function, i.e. offset and scale]. - This has the same functional form as: - - x'(i) = y'(i) * scale + [affine function of y(i)]. - - since y(i) is an affine function of x(i) with nonzero scale. - Because the output is invariant to shifts in the input, sum_i x'(i) - will be zero. This is sufficient to determine the bias - term in the affine function. [Note: the scale on y(i) doesn't - come into it because the y(i) sum to zero]. The offset - will just be (sum_i y'(i) * scale / n); this makes the sum of x'(i) zero. - So let's write it as - - x'(i) = (y'(i) - 1/n sum_i y'(i)) * scale + alpha y(i). - - and it will be convenient to define: - - x_deriv_base(i) = (y'(i) - 1/n sum_i y'(i)) * scale - - which is just y'(i) with mean subtraction, scaled according to - the scale used in the normalization. So write - - x'(i) = x_deriv_base(i) + alpha y(i). - - The question is, what is the scale alpha. We don't actually need to - do any differentiation to figure this out. First, assume there is - no "+ epsilon" in the variance; later we'll explain why this doesn't - matter. The key to working out alpha is that the output is invariant - to scaling of the input. Assume we scale around the input's mean, - since that makes the math simpler. We can express this by the - constraint that (\sum_i x'(i) * (x(i) - avg-x)) = 0. This is - equivalent to the constraint that (\sum_i x'(i) y (i)) = 0, since - y(i) is x(i) - avg-x times a nonzero scale. We'll use this contraint - to determine alpha, Using the above expressionfor x(i), we can write - this constraint as: - \sum_i ( y(i) x_deriv_base(i) + alpha y(i) y(i)) = 0. - Now, since we said we'd ignore the epsilon, the output has unit variance, - so we know that \sum_i y(i) y(i) = n. - So alpha = - \sum_i y(i) x_deriv_base(i) / n. We can actually re-imagine - the epsilon term (or variance-flooring) as having been implemented by - adding a couple extra rows to the matrix with suitable values, and zero - output-deriv for those rows. If you think about it carefully you'll see that - the formula above is valid even if there is an extra term - in the variance. Anyway the correctness of the derivative will get tested - throughly by the component unit-tests. - - So to recap, here is the backprop. - - BACKWARD PASS: - - We are given y'(i), scale, and y(i). - - We compute: - x_deriv_base(i) = (y'(i) - 1/n sum_i y'(i)) * scale - alpha = - \sum_i y(i) x_deriv_base(i) / n - x'(i) = x_deriv_base(i) + alpha y(i) - */ - - - -void* BatchNormComponent::Propagate(const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in, - CuMatrixBase *out) const { - KALDI_ASSERT(SameDim(in, *out) && - (in.NumCols() == dim_ || in.NumCols() == block_dim_)); - if (in.NumCols() != block_dim_) { - // if block_dim_ != dim_, we recurse; this helps keep the main code - // simple. 
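-    // (Example of the reshape used for this recursion: with dim_ = 40,
-    // block_dim_ = 10 and a [64 x 40] input, ratio = 4, so the data is
-    // viewed as a [256 x 10] matrix (new_rows = 64 * 4, new_cols = 40 / 4).
-    // This reinterpretation is only valid for contiguous storage, which is
-    // what the Stride() == NumCols() assertion below checks.)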
-
-
-void* BatchNormComponent::Propagate(const ComponentPrecomputedIndexes *indexes,
-                                    const CuMatrixBase<BaseFloat> &in,
-                                    CuMatrixBase<BaseFloat> *out) const {
-  KALDI_ASSERT(SameDim(in, *out) &&
-               (in.NumCols() == dim_ || in.NumCols() == block_dim_));
-  if (in.NumCols() != block_dim_) {
-    // if block_dim_ != dim_, we recurse; this helps keep the main code
-    // simple.
-    KALDI_ASSERT(in.Stride() == in.NumCols() && out->Stride() == out->NumCols());
-    int32 ratio = dim_ / block_dim_, orig_rows = in.NumRows(),
-        orig_cols = in.NumCols(), new_rows = orig_rows * ratio,
-        new_cols = orig_cols / ratio;
-    CuSubMatrix<BaseFloat> in_reshaped(in.Data(), new_rows, new_cols, new_cols),
-        out_reshaped(out->Data(), new_rows, new_cols, new_cols);
-    return Propagate(indexes, in_reshaped, &out_reshaped);
-  }
-
-  // From this point, we can assume that the num-cols of 'in' and 'out'
-  // equals block_dim_.
-
-  if (!test_mode_) {
-    // search in the comment above for FORWARD PASS to see what is being
-    // implemented here.
-    // if this takes too much time due to multiple different CUDA calls,
-    // we'll consider making a single kernel for some of it.
-    Memo *memo = new Memo;
-    int32 num_frames = in.NumRows(), dim = block_dim_;
-    memo->num_frames = num_frames;
-    memo->mean_uvar_scale.Resize(4, dim);
-    CuSubVector<BaseFloat> mean(memo->mean_uvar_scale, 0),
-        uvar(memo->mean_uvar_scale, 1),
-        scale(memo->mean_uvar_scale, 2);
-    mean.AddRowSumMat(1.0 / num_frames, in, 0.0);
-    uvar.AddDiagMat2(1.0 / num_frames, in, kTrans, 0.0);
-    scale.CopyFromVec(uvar);
-    // by applying this scale at this point, we save a multiply later on.
-    BaseFloat var_scale = 1.0 / (target_rms_ * target_rms_);
-    scale.AddVecVec(-var_scale, mean, mean, var_scale);
-    // at this point, 'scale' contains just the variance [divided by
-    // target-rms^2].
-    scale.ApplyFloor(0.0);
-    scale.Add(var_scale * epsilon_);
-    // Now 'scale' contains the variance floored to zero and then with epsilon
-    // added [both divided by target-rms^2].
-    scale.ApplyPow(-0.5);
-    // now 'scale' is the actual scale we'll use.
-
-    // the next command will do no work if out == in, for in-place propagation.
-    out->CopyFromMat(in);
-    out->AddVecToRows(-1.0, mean, 1.0);
-    out->MulColsVec(scale);
-    return static_cast<void*>(memo);
-  } else {
-    if (offset_.Dim() != block_dim_) {
-      if (count_ == 0)
-        KALDI_ERR << "Test mode set in BatchNormComponent, but no stats.";
-      else  // why was ComputeDerived() not called?
-        KALDI_ERR << "Code error in BatchNormComponent";
-    }
-    out->CopyFromMat(in);
-    out->MulColsVec(scale_);
-    out->AddVecToRows(1.0, offset_, 1.0);
-    return NULL;
-  }
-}
-
-void BatchNormComponent::Backprop(
-    const std::string &debug_info,
-    const ComponentPrecomputedIndexes *indexes,
-    const CuMatrixBase<BaseFloat> &in_value,  // unused
-    const CuMatrixBase<BaseFloat> &out_value,
-    const CuMatrixBase<BaseFloat> &out_deriv,
-    void *memo_in,
-    Component *to_update,  // unused
-    CuMatrixBase<BaseFloat> *in_deriv) const {
-
-  KALDI_ASSERT(SameDim(out_value, out_deriv) &&
-               SameDim(out_value, *in_deriv) &&
-               (out_value.NumCols() == dim_ ||
-                out_value.NumCols() == block_dim_));
-  if (out_value.NumCols() != block_dim_) {
-    // if block_dim_ != dim_, we recurse; this helps keep the main code
-    // simple.
-    KALDI_ASSERT(out_value.Stride() == out_value.NumCols() &&
-                 out_deriv.Stride() == out_deriv.NumCols() &&
-                 in_deriv->Stride() == in_deriv->NumCols());
-    int32 ratio = dim_ / block_dim_,
-        orig_rows = out_value.NumRows(),
-        orig_cols = out_value.NumCols(),
-        new_rows = orig_rows * ratio, new_cols = orig_cols / ratio;
-    CuSubMatrix<BaseFloat> out_value_reshaped(out_value.Data(), new_rows,
-                                              new_cols, new_cols),
-        out_deriv_reshaped(out_deriv.Data(), new_rows, new_cols, new_cols),
-        in_deriv_reshaped(in_deriv->Data(), new_rows, new_cols, new_cols);
-    // we'll never use in_value, so pass it in unchanged.
-    Backprop(debug_info, indexes, in_value,
-             out_value_reshaped, out_deriv_reshaped,
-             memo_in, to_update, &in_deriv_reshaped);
-    return;
-  }
-
-  Memo *memo = static_cast<Memo*>(memo_in);
-
-  if (!test_mode_) {
-    // search above for BACKWARD PASS for a comment describing the math.
-    KALDI_ASSERT(memo != NULL && "memo not passed into backprop");
-    int32 num_frames = memo->num_frames;
-    KALDI_ASSERT(out_value.NumRows() == num_frames);
-    CuSubVector<BaseFloat> temp(memo->mean_uvar_scale, 3),
-        scale(memo->mean_uvar_scale, 2);
-    temp.AddRowSumMat(-1.0 / num_frames, out_deriv, 0.0);
-    // the following does no work if in_deriv and out_deriv are the same
-    // matrix.
-    in_deriv->CopyFromMat(out_deriv);
-    in_deriv->AddVecToRows(1.0, temp);
-    in_deriv->MulColsVec(scale);
-    // at this point, 'in_deriv' contains:
-    // x_deriv_base(i) = (y'(i) - 1/n sum_i y'(i)) * scale
-    temp.AddDiagMatMat(-1.0 / (num_frames * target_rms_ * target_rms_),
-                       out_value, kTrans, *in_deriv, kNoTrans, 0.0);
-    // now, 'temp' contains the quantity which we described
-    // in the math as:
-    //   alpha = - \sum_i y(i) x_deriv_base(i) / n.
-    // The factor 1 / (target_rms_ * target_rms_) comes from following
-    // this additional scaling factor through the math.  In the comment I said
-    // "we know that \sum_i y(i) y(i) = n".  Taking target-rms into account
-    // this becomes "we know that \sum_i y(i) y(i) = n * target-rms^2".
-    in_deriv->AddMatDiagVec(1.0, out_value, kNoTrans, temp, 1.0);
-    // At this point, in_deriv contains x'(i) = x_deriv_base(i) + alpha y(i).
-
-  } else {
-    KALDI_ASSERT(offset_.Dim() == block_dim_);
-    // the next call does no work if they point to the same memory.
-    in_deriv->CopyFromMat(out_deriv);
-    in_deriv->MulColsVec(scale_);
-  }
-}
-
-void BatchNormComponent::StoreStats(
-    const CuMatrixBase<BaseFloat> &in_value,
-    const CuMatrixBase<BaseFloat> &out_value,
-    void *memo_in) {
-  // in test mode this component does not store stats, it doesn't provide the
-  // kStoresStats flag.
-  KALDI_ASSERT(!test_mode_);
-  KALDI_ASSERT(out_value.NumCols() == dim_ || out_value.NumCols() == block_dim_);
-  if (out_value.NumCols() != block_dim_) {
-    // if block_dim_ != dim_, we recurse; this helps keep the main code
-    // simple.
-    KALDI_ASSERT(out_value.Stride() == out_value.NumCols());
-    int32 ratio = dim_ / block_dim_,
-        orig_rows = out_value.NumRows(),
-        orig_cols = out_value.NumCols(),
-        new_rows = orig_rows * ratio, new_cols = orig_cols / ratio;
-    CuSubMatrix<BaseFloat> out_value_reshaped(out_value.Data(), new_rows,
-                                              new_cols, new_cols);
-    // we'll never use in_value, so just pass it in unchanged.
-    StoreStats(in_value, out_value_reshaped, memo_in);
-    return;
-  }
-
-  Memo *memo = static_cast<Memo*>(memo_in);
-  KALDI_ASSERT(out_value.NumRows() == memo->num_frames);
-
-  CuSubVector<BaseFloat> mean(memo->mean_uvar_scale, 0),
-      uvar(memo->mean_uvar_scale, 1);
-  KALDI_ASSERT(mean.Dim() == block_dim_ && memo->num_frames > 0);
-  BaseFloat num_frames = memo->num_frames;
-  if (stats_sum_.Dim() != block_dim_) {
-    stats_sum_.Resize(block_dim_);
-    stats_sumsq_.Resize(block_dim_);
-    KALDI_ASSERT(count_ == 0);
-  }
-  count_ += num_frames;
-  stats_sum_.AddVec(num_frames, mean, 1.0);
-  stats_sumsq_.AddVec(num_frames, uvar, 1.0);
-}
-
-void BatchNormComponent::Read(std::istream &is, bool binary) {
-  ExpectOneOrTwoTokens(is, binary, "<BatchNormComponent>", "<Dim>");
-  ReadBasicType(is, binary, &dim_);
-  ExpectToken(is, binary, "<BlockDim>");
-  ReadBasicType(is, binary, &block_dim_);
-  ExpectToken(is, binary, "<Epsilon>");
-  ReadBasicType(is, binary, &epsilon_);
-  ExpectToken(is, binary, "<TargetRms>");
-  ReadBasicType(is, binary, &target_rms_);
-  ExpectToken(is, binary, "<TestMode>");
-  ReadBasicType(is, binary, &test_mode_);
-  ExpectToken(is, binary, "<Count>");
-  ReadBasicType(is, binary, &count_);
-  ExpectToken(is, binary, "<StatsMean>");
-  stats_sum_.Read(is, binary);
-  ExpectToken(is, binary, "<StatsVar>");
-  stats_sumsq_.Read(is, binary);
-  stats_sumsq_.AddVecVec(1.0, stats_sum_, stats_sum_, 1.0);
-  stats_sum_.Scale(count_);
-  stats_sumsq_.Scale(count_);
-  ExpectToken(is, binary, "</BatchNormComponent>");
-  ComputeDerived();
-  Check();
-}
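Read() above reconstructs the in-memory sum and sum-of-squares from the mean and variance that Write() (just below) puts on disk. A quick standalone check of that round-trip for a single dimension (not part of the patch; plain C++, made-up values):

#include <cstdio>

int main() {
  // In-memory stats for one dimension: count, sum, sum-of-squares.
  double count = 4.0, sum = 8.0, sumsq = 20.0;
  // Write(): store the mean and the variance (uncentered variance minus
  // mean^2), both obtained by dividing by the count.
  double mean = sum / count;                 // 2.0
  double var = sumsq / count - mean * mean;  // 5.0 - 4.0 = 1.0
  // Read(): reconstruct, mirroring the AddVecVec + Scale calls above.
  double sumsq2 = (var + mean * mean) * count;  // back to 20.0
  double sum2 = mean * count;                   // back to 8.0
  std::printf("sum %.1f -> %.1f, sumsq %.1f -> %.1f\n",
              sum, sum2, sumsq, sumsq2);
  return 0;
}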
-void BatchNormComponent::Write(std::ostream &os, bool binary) const {
-  Check();
-  WriteToken(os, binary, "<BatchNormComponent>");
-  WriteToken(os, binary, "<Dim>");
-  WriteBasicType(os, binary, dim_);
-  WriteToken(os, binary, "<BlockDim>");
-  WriteBasicType(os, binary, block_dim_);
-  WriteToken(os, binary, "<Epsilon>");
-  WriteBasicType(os, binary, epsilon_);
-  WriteToken(os, binary, "<TargetRms>");
-  WriteBasicType(os, binary, target_rms_);
-  WriteToken(os, binary, "<TestMode>");
-  WriteBasicType(os, binary, test_mode_);
-  WriteToken(os, binary, "<Count>");
-  WriteBasicType(os, binary, count_);
-  CuVector<double> mean(stats_sum_), var(stats_sumsq_);
-  if (count_ != 0) {
-    mean.Scale(1.0 / count_);
-    var.Scale(1.0 / count_);
-    var.AddVecVec(-1.0, mean, mean, 1.0);
-  }
-  WriteToken(os, binary, "<StatsMean>");
-  mean.Write(os, binary);
-  WriteToken(os, binary, "<StatsVar>");
-  var.Write(os, binary);
-  WriteToken(os, binary, "</BatchNormComponent>");
-}
-
-void BatchNormComponent::Scale(BaseFloat scale) {
-  if (scale == 0) {
-    count_ = 0.0;
-    stats_sum_.SetZero();
-    stats_sumsq_.SetZero();
-  } else {
-    count_ *= scale;
-    stats_sum_.Scale(scale);
-    stats_sumsq_.Scale(scale);
-  }
-}
-
-
-void BatchNormComponent::Add(BaseFloat alpha, const Component &other_in) {
-  const BatchNormComponent *other =
-      dynamic_cast<const BatchNormComponent*>(&other_in);
-  count_ += alpha * other->count_;
-  stats_sum_.AddVec(alpha, other->stats_sum_);
-  stats_sumsq_.AddVec(alpha, other->stats_sumsq_);
-  // this operation might change offset_ and scale_, so we recompute them
-  // in this instance (but not in Scale()).
-  ComputeDerived();
-}
-
-void BatchNormComponent::ZeroStats() {
-  // We only zero the stats if we're not in test mode.  In test mode, this
-  // would be dangerous as the stats are the source for the transform, and
-  // zeroing them and then calling ComputeDerived() again would remove the
-  // transform parameters (offset_ and scale_).
-  if (!test_mode_) {
-    count_ = 0.0;
-    stats_sum_.SetZero();
-    stats_sumsq_.SetZero();
-  }
-}
-
 SumBlockComponent::SumBlockComponent(const SumBlockComponent &other):
     input_dim_(other.input_dim_), output_dim_(other.output_dim_),
     scale_(other.scale_) { }
diff --git a/src/nnet3/nnet-simple-component.h b/src/nnet3/nnet-simple-component.h
index d7cece06284..b1eb30a55bf 100644
--- a/src/nnet3/nnet-simple-component.h
+++ b/src/nnet3/nnet-simple-component.h
@@ -40,6 +40,9 @@ namespace nnet3 {
 /// output for one input, and return the kSimpleComponent flag in their
 /// Properties(): for example, tanh and affine components.  In
 /// nnet-general-component.h there are components that don't fit this pattern.
+///
+/// Some components that do provide the kSimpleComponent flag are not declared
+/// here: see also nnet-normalize-component.h.
 
 // This "nnet3" version of the p-norm component only supports the 2-norm.
 class PnormComponent: public Component {
@@ -186,82 +189,6 @@ class ElementwiseProductComponent: public Component {
   int32 output_dim_;
 };
 
-/*
-   Implements the function:
-
-      y = x * (sqrt(dim(x)) * target-rms) / |x|
-
-   where |x| is the 2-norm of the vector x.  I.e. its output is its input
-   scaled such that the root-mean-square value of its elements equals
-   target-rms.  (As a special case, if the input is zero, it outputs zero).
-
-   Note: if you specify add-log-stddev=true, it adds an extra element to
-   y which equals log(|x| / sqrt(dim(x))).
-
-
-   Configuration values accepted:
-      dim, or input-dim   Input dimension of this component, e.g. 1024.
-                          Will be the same as the output dimension if
-                          add-log-stddev=false.
-      block-dim           Defaults to 'dim', but you may specify a nonzero
-                          divisor of 'dim'.  In this case the input dimension
-                          will be interpreted as blocks of dimension
-                          'block-dim' to which the nonlinearity described
-                          above is applied separately.
-      add-log-stddev      You can set this to true to add an extra output
-                          dimension which will equal log(|x| / sqrt(dim(x))).
-                          If block-dim is specified, this is done per block.
-      target-rms          This defaults to 1.0, but if you set it to another
-                          (nonzero) value, the output will be scaled by this
-                          factor.
- */
-class NormalizeComponent: public Component {
- public:
-  explicit NormalizeComponent(const NormalizeComponent &other);
-
-  virtual int32 Properties() const {
-    return kSimpleComponent|kBackpropNeedsInput|kBackpropAdds|
-        (add_log_stddev_ ? 0 : kPropagateInPlace|kBackpropInPlace) |
-        (block_dim_ != input_dim_ ? kInputContiguous|kOutputContiguous : 0);
-  }
-  NormalizeComponent() { }
-  virtual std::string Type() const { return "NormalizeComponent"; }
-  virtual void InitFromConfig(ConfigLine *cfl);
-  virtual Component* Copy() const { return new NormalizeComponent(*this); }
-  virtual void* Propagate(const ComponentPrecomputedIndexes *indexes,
-                          const CuMatrixBase<BaseFloat> &in,
-                          CuMatrixBase<BaseFloat> *out) const;
-  virtual void Backprop(const std::string &debug_info,
-                        const ComponentPrecomputedIndexes *indexes,
-                        const CuMatrixBase<BaseFloat> &in_value,
-                        const CuMatrixBase<BaseFloat> &,  // out_value
-                        const CuMatrixBase<BaseFloat> &out_deriv,
-                        void *memo,
-                        Component *to_update,
-                        CuMatrixBase<BaseFloat> *in_deriv) const;
-
-  virtual void Read(std::istream &is, bool binary);
-  virtual void Write(std::ostream &os, bool binary) const;
-  virtual int32 InputDim() const { return input_dim_; }
-  virtual int32 OutputDim() const {
-    return (input_dim_ + (add_log_stddev_ ? (input_dim_ / block_dim_) : 0));
-  }
-  virtual std::string Info() const;
- private:
-  NormalizeComponent &operator = (const NormalizeComponent &other); // Disallow.
-  enum { kExpSquaredNormFloor = -66 };
-  // kSquaredNormFloor is about 0.7e-20.  We need a value that's exactly
-  // representable in float and whose inverse square root is also exactly
-  // representable in float (hence, an even power of two).
-  static const BaseFloat kSquaredNormFloor;
-  int32 input_dim_;
-  int32 block_dim_;
-  BaseFloat target_rms_;  // The target rms for outputs, default 1.0.
-
-  bool add_log_stddev_;  // If true, log(max(epsi, sqrt(row_in^T row_in / D)))
-                         // is an extra dimension of the output.
-};
-
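To make the function being removed above concrete, here is a standalone sketch (not part of the patch; plain C++, made-up values) of the NormalizeComponent mapping for a 4-dimensional input with target-rms = 1.0, including the optional add-log-stddev output:

#include <cmath>
#include <cstdio>

int main() {
  const int dim = 4;
  const double target_rms = 1.0;
  double x[dim] = { 3.0, -4.0, 0.0, 0.0 };  // |x| = 5
  double norm = 0.0;
  for (int i = 0; i < dim; i++) norm += x[i] * x[i];
  norm = std::sqrt(norm);
  // y = x * (sqrt(dim(x)) * target-rms) / |x|; the factor is 2/5 here,
  // which makes the RMS of the output elements equal to target-rms.
  double factor = std::sqrt((double)dim) * target_rms / norm;
  for (int i = 0; i < dim; i++)
    std::printf("y(%d) = %.3f\n", i, x[i] * factor);
  // With add-log-stddev=true, one extra output equals log(|x| / sqrt(dim)):
  std::printf("log-stddev output = %.3f\n",
              std::log(norm / std::sqrt((double)dim)));
  return 0;
}

For these values the output is (1.2, -1.6, 0, 0), whose root-mean-square is exactly 1.0, and the extra output is log(5/2), roughly 0.916.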
 
 /*
    Implements the sigmoid nonlinearity, i.e. the function y = 1 / (1 + exp(-x)).
@@ -463,10 +390,11 @@ class AffineComponent: public UpdatableComponent {
 
   virtual int32 InputDim() const { return linear_params_.NumCols(); }
   virtual int32 OutputDim() const { return linear_params_.NumRows(); }
+  BaseFloat OrthonormalConstraint() const { return orthonormal_constraint_; }
 
   virtual std::string Info() const;
   virtual void InitFromConfig(ConfigLine *cfl);
-  AffineComponent() { }  // use Init to really initialize.
+  AffineComponent(): orthonormal_constraint_(0.0) { }  // use Init to really initialize.
   virtual std::string Type() const { return "AffineComponent"; }
   virtual int32 Properties() const {
     return kSimpleComponent|kUpdatableComponent|
@@ -507,6 +435,7 @@ class AffineComponent: public UpdatableComponent {
                  const CuMatrixBase<BaseFloat> &linear);
   const CuVector<BaseFloat> &BiasParams() const { return bias_params_; }
   const CuMatrix<BaseFloat> &LinearParams() const { return linear_params_; }
+  CuMatrix<BaseFloat> &LinearParams() { return linear_params_; }
   explicit AffineComponent(const AffineComponent &other);
   // The next constructor is used in converting from nnet1.
   AffineComponent(const CuMatrixBase<BaseFloat> &linear_params,
@@ -539,6 +468,7 @@ class AffineComponent: public UpdatableComponent {
   const AffineComponent &operator = (const AffineComponent &other); // Disallow.
   CuMatrix<BaseFloat> linear_params_;
   CuVector<BaseFloat> bias_params_;
+  BaseFloat orthonormal_constraint_;
 };
 
 class RepeatedAffineComponent;
@@ -828,6 +758,19 @@ class LogSoftmaxComponent: public NonlinearComponent {
       Dimension is output-dim by (input-dim + 1), last
       column is interpreted as the bias.
 
+   Other options:
+      orthonormal-constraint=0.0   If you set this to 1.0, then
+           the linear_params_ matrix will be (approximately)
+           constrained during training to have orthonormal rows
+           (or columns, whichever is fewer).  You can choose a
+           positive nonzero value different than 1.0 to have a
+           scaled orthonormal matrix, i.e. with singular values
+           at the selected value (e.g. 0.5, or 2.0).  This is
+           not enforced inside the component itself; you have to
+           call ConstrainOrthonormal() from the training code to
+           do this.  All this component does is return the
+           OrthonormalConstraint() value.
+
    Options to the natural gradient (you won't normally have to set these,
    the defaults are suitable):
 
@@ -871,22 +814,10 @@ class NaturalGradientAffineComponent: public AffineComponent {
   NaturalGradientAffineComponent &operator= (
       const NaturalGradientAffineComponent&);
 
-  // Configs for preconditioner.  The input side tends to be better
-  // conditioned -> smaller rank needed, so make them separately configurable.
-  int32 rank_in_;
-  int32 rank_out_;
-  int32 update_period_;
-  BaseFloat num_samples_history_;
-  BaseFloat alpha_;
-
   OnlineNaturalGradient preconditioner_in_;
   OnlineNaturalGradient preconditioner_out_;
 
-  // Sets the configs rank, alpha and eta in the preconditioner objects,
-  // from the class variables.
-  void SetNaturalGradientConfigs();
-
   virtual void Update(
       const std::string &debug_info,
       const CuMatrixBase<BaseFloat> &in_value,
@@ -919,6 +850,16 @@ class NaturalGradientAffineComponent: public AffineComponent {
       bias-stddev, bias-mean) to initialize the parameters.
       Dimension is output-dim by (input-dim + 1), last
       column is interpreted as the bias.
+      orthonormal-constraint=0.0   If you set this to 1.0, then
+          this matrix will be (approximately) constrained during
+          training to have orthonormal rows (or columns, whichever
+          is fewer).  You can choose a positive nonzero value different
+          than 1.0 to have a scaled orthonormal matrix, i.e. with singular
+          values at the selected value (e.g. 0.5, or 2.0).
+          This is not enforced inside the component
+          itself; you have to call ConstrainOrthonormal()
+          from the training code to do this.  All this component
+          does is return the OrthonormalConstraint() value.
 
    Options to the natural gradient (you won't normally have to set these,
    the defaults are suitable):
@@ -982,14 +923,19 @@ class LinearComponent: public UpdatableComponent {
 
   explicit LinearComponent(const LinearComponent &other);
 
   explicit LinearComponent(const CuMatrix<BaseFloat> &params);
+
+  BaseFloat OrthonormalConstraint() const { return orthonormal_constraint_; }
+  CuMatrixBase<BaseFloat> &Params() { return params_; }
+  const CuMatrixBase<BaseFloat> &Params() const { return params_; }
  private:
 
   // disallow assignment operator.
   LinearComponent &operator= (
       const LinearComponent&);
 
-  CuMatrix<BaseFloat> params_;
+
+  BaseFloat orthonormal_constraint_;
   // If true (and if no this->is_gradient_), use natural gradient updates.
   bool use_natural_gradient_;
   OnlineNaturalGradient preconditioner_in_;
@@ -1460,8 +1406,12 @@ class PermuteComponent: public Component {
 
 
 
-// PerElementScaleComponent scales each dimension of its input with a separate
-// trainable scale; it's like a linear component with a diagonal matrix.
+/**
+   PerElementScaleComponent scales each dimension of its input with a separate
+   trainable scale; it's like a linear component with a diagonal matrix.  This
+   version (and its child class NaturalGradientPerElementScaleComponent)
+   requires the input for backprop.  See also ScaleAndOffsetComponent.
+*/
 class PerElementScaleComponent: public UpdatableComponent {
  public:
   virtual int32 InputDim() const { return scales_.Dim(); }
@@ -1474,7 +1424,7 @@ class PerElementScaleComponent: public UpdatableComponent {
   virtual std::string Type() const { return "PerElementScaleComponent"; }
   virtual int32 Properties() const {
     return kSimpleComponent|kUpdatableComponent|kBackpropNeedsInput|
-        kPropagateInPlace;
+        kPropagateInPlace|kBackpropInPlace;
   }
 
   virtual void* Propagate(const ComponentPrecomputedIndexes *indexes,
@@ -1686,8 +1636,7 @@ class ConstantFunctionComponent: public UpdatableComponent {
 
 // NaturalGradientPerElementScaleComponent is like PerElementScaleComponent but
-// it uses a natural gradient update for the per-element scales, and enforces a
-// maximum amount of change per minibatch, for stability.
+// it uses a natural gradient update for the per-element scales.
 class NaturalGradientPerElementScaleComponent:
     public PerElementScaleComponent {
  public:
@@ -2384,183 +2333,6 @@ class MaxpoolingComponent: public Component {
 };
 
 
-/*
-  BatchNormComponent
-
-  This implements batch normalization; for each dimension of the
-  input it normalizes the data to be zero-mean, unit-variance.  You
-  can set the block-dim configuration value to implement spatial
-  batch normalization; see the comment for the variable.
-
-  If you want to combine this with the trainable offset and scale that the
-  original BatchNorm paper used, then follow this by the
-  ScaleAndOffsetComponent.
-
-  It's a simple component (uses the kSimpleComponent flag), but it is unusual
-  in that it will give different results if you call it on half the matrix at
-  a time.  Most of the time this would be pretty harmless, so we still return
-  the kSimpleComponent flag.  We may have to modify the test code a little to
-  account for this, or possibly remove the kSimpleComponent flag.  In some
-  sense each output Index depends on every input Index, but putting those
-  dependencies explicitly into the dependency-tracking framework as a
-  GeneralComponent would be very impractical and might lead to a lot of
-  unnecessary things being computed.  You have to be a bit careful where you
-  put this component, and understand what you're doing, e.g. putting it in the
-  path of a recurrence is a bit problematic if the minibatch size is small.
-
-  Accepted configuration values:
-     dim          Dimension of the input and output.
-     block-dim    Defaults to 'dim', but may be set to a nonzero divisor
-                  of 'dim'.  In this case, each block of dimension 'block-dim'
-                  is treated like a separate row of the input matrix, which
-                  means that the stats from the n'th element of each
-                  block are pooled into one class, for each n.
-     epsilon      Small term added to the variance that is used to prevent
-                  division by zero.
-     target-rms   This defaults to 1.0, but if set, for instance, to 2.0,
-                  it will normalize the standard deviation of the output to
-                  2.0.  'target-stddev' might be a more suitable name, but
-                  this was chosen for consistency with NormalizeComponent.
- */
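To illustrate the block-dim pooling described above, the following standalone sketch (not part of the patch; plain C++, made-up values) shows how a 2x6 input with block-dim = 3 is viewed as a 4x3 matrix, so that the n'th column of every block lands in the same normalization class:

#include <cstdio>

int main() {
  const int rows = 2, dim = 6, block_dim = 3;
  const int ratio = dim / block_dim;
  double in[rows][dim] = { { 0, 1, 2, 3, 4, 5 },
                           { 6, 7, 8, 9, 10, 11 } };
  // Reinterpret the contiguous data as a (rows * ratio) x block_dim matrix,
  // exactly like the CuSubMatrix built from in.Data() in Propagate().
  // After the reshape, reshaped column 0 holds elements {0, 3, 6, 9}, i.e.
  // original columns 0 and 3 are pooled; likewise {1, 4} and {2, 5}.
  const double *data = &in[0][0];
  for (int r = 0; r < rows * ratio; r++) {
    for (int c = 0; c < block_dim; c++)
      std::printf("%5.1f", data[r * block_dim + c]);
    std::printf("\n");
  }
  return 0;
}

This is why the component demands contiguous (stride == num-cols) input and output whenever block-dim is less than dim.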
-class BatchNormComponent: public Component {
- public:
-
-  BatchNormComponent() { }
-
-  // call this with 'true' to set 'test mode' where the batch normalization is
-  // done with stored stats.  There won't normally be any need to specially
-  // accumulate these stats; they are stored as a matter of course on each
-  // iteration of training, as for NonlinearComponents, and we'll use the stats
-  // from the most recent [script-level] iteration.
-  void SetTestMode(bool test_mode);
-
-  // constructor using another component
-  BatchNormComponent(const BatchNormComponent &other);
-
-  virtual int32 InputDim() const { return dim_; }
-  virtual int32 OutputDim() const { return dim_; }
-
-  virtual std::string Info() const;
-  virtual void InitFromConfig(ConfigLine *cfl);
-  virtual std::string Type() const { return "BatchNormComponent"; }
-  virtual int32 Properties() const {
-    // If the block-dim is less than the dim, we need the input and output
-    // matrices to be contiguous (stride==num-cols), as we'll be reshaping
-    // internally.  This is not much of a cost, because this will be used
-    // in convnets where we have to do this anyway.
-    return kSimpleComponent|kBackpropNeedsOutput|kPropagateInPlace|
-        kBackpropInPlace|
-        (block_dim_ < dim_ ? kInputContiguous|kOutputContiguous : 0)|
-        (test_mode_ ? 0 : kUsesMemo|kStoresStats);
-  }
-  virtual void* Propagate(const ComponentPrecomputedIndexes *indexes,
-                          const CuMatrixBase<BaseFloat> &in,
-                          CuMatrixBase<BaseFloat> *out) const;
-  virtual void Backprop(const std::string &debug_info,
-                        const ComponentPrecomputedIndexes *indexes,
-                        const CuMatrixBase<BaseFloat> &in_value,
-                        const CuMatrixBase<BaseFloat> &out_value,
-                        const CuMatrixBase<BaseFloat> &out_deriv,
-                        void *memo,
-                        Component *,  // to_update
-                        CuMatrixBase<BaseFloat> *in_deriv) const;
-
-  virtual void Read(std::istream &is, bool binary);  // This Read function
-  // requires that the Component has the correct type.
-
-  /// Write component to stream
-  virtual void Write(std::ostream &os, bool binary) const;
-  virtual Component* Copy() const { return new BatchNormComponent(*this); }
-
-  virtual void Scale(BaseFloat scale);
-  virtual void Add(BaseFloat alpha, const Component &other);
-  virtual void ZeroStats();
-
-
-  virtual void DeleteMemo(void *memo) const { delete static_cast<Memo*>(memo); }
-
-  virtual void StoreStats(const CuMatrixBase<BaseFloat> &in_value,
-                          const CuMatrixBase<BaseFloat> &out_value,
-                          void *memo);
-
-  // Members specific to this component type.
-  // Note: the offset and scale will only be nonempty in 'test mode'.
-  const CuVector<BaseFloat> &Offset() const { return offset_; }
-  const CuVector<BaseFloat> &Scale() const { return scale_; }
-
- private:
-
-  struct Memo {
-    // number of frames (after any reshaping).
-    int32 num_frames;
-    // 'mean_uvar_scale' is of dimension 4 by block_dim_:
-    // Row 0 = mean = the mean of the rows of the input.
-    // Row 1 = uvar = the uncentered variance of the input (= sumsq / num_frames).
-    // Row 2 = scale = the scale of the renormalization, which is the inverse
-    //         stddev of the input (modified by epsilon_; see the Propagate
-    //         function).
-    // Row 3 is used as a temporary in Backprop.
-    CuMatrix<BaseFloat> mean_uvar_scale;
-  };
-
-  void Check() const;
-
-  // this function is used in a couple of places; it turns the raw stats into
-  // the offset/scale term of a normalizing transform.
-  static void ComputeOffsetAndScale(double count,
-                                    BaseFloat epsilon,
-                                    const Vector<double> &stats_sum,
-                                    const Vector<double> &stats_sumsq,
-                                    Vector<BaseFloat> *offset,
-                                    Vector<BaseFloat> *scale);
-  // computes derived parameters offset_ and scale_.
-  void ComputeDerived();
-
-  // Dimension of the input and output.
-  int32 dim_;
-  // This would normally be the same as dim_, but if it's less (and it must
-  // be > 0 and must divide dim_), then each separate block of the input of
-  // dimension 'block_dim_' is treated like a separate frame for the purposes
-  // of normalization.  This can be used to implement spatial batch
-  // normalization for convolutional setups-- assuming the filter-dim has
-  // stride 1, which it always will in the new code in
-  // nnet-convolutional-component.h.
-  int32 block_dim_;
-
-  // Used to avoid exact-zero variances; epsilon has the dimension of a
-  // covariance.
-  BaseFloat epsilon_;
-
-  // This value will normally be 1.0, which is the default, but you can set it
-  // to other values as a way to control how fast the following layer learns
-  // (smaller -> slower).  The same config exists in NormalizeComponent.
-  BaseFloat target_rms_;
-
-  // This is true if we want the batch normalization to operate in 'test mode'
-  // meaning the data mean and stddev used for the normalization are fixed
-  // quantities based on previously accumulated stats.  Note: the stats we use
-  // for this are based on the same 'StoreStats' mechanism as we use for
-  // components like SigmoidComponent and ReluComponent; we'll be using
-  // the stats from the most recent [script-level] iteration of training.
-  bool test_mode_;
-
-
-  // total count of stats stored by StoreStats().
-  double count_;
-  // sum-of-data component of stats of input data.
-  CuVector<double> stats_sum_;
-  // sum-of-squared component of stats of input data.
-  CuVector<double> stats_sumsq_;
-
-  // offset_ and scale_ are derived from stats_sum_ and stats_sumsq_; they
-  // dictate the transform that is done in 'test mode'.  They are set only when
-  // reading the model from disk and when calling SetTestMode(true); they are
-  // resized to empty when the stats are updated, to ensure that out-of-date
-  // values are not kept around.
-  CuVector<BaseFloat> offset_;
-  CuVector<BaseFloat> scale_;
-};
-
-
 /**
    CompositeComponent is a component representing a sequence of
    [simple] components.  The config line would be something like the following
diff --git a/src/nnet3/nnet-test-utils.cc b/src/nnet3/nnet-test-utils.cc
index 83b902a9b90..48a97df9ea1 100644
--- a/src/nnet3/nnet-test-utils.cc
+++ b/src/nnet3/nnet-test-utils.cc
@@ -1675,11 +1675,11 @@ static void GenerateRandomComponentConfig(std::string *component_type,
     // labels to the most recently added component, so it gets tested more
     case 31: {
       *component_type = "BatchNormComponent";
-      int32 block_dim = RandInt(1, 10), dim = block_dim * RandInt(1, 2);
+      int32 block_dim = RandInt(1, 20), dim = block_dim * RandInt(1, 2);
       bool test_mode = (RandInt(0, 1) == 0);
       os << " dim=" << dim << " block-dim=" << block_dim << " target-rms="
-         << RandInt(1, 2) << " test-mode="
+         << RandInt(1, 4) << " test-mode="
          << (test_mode ? "true" : "false")
          << " epsilon=" << (RandInt(0, 1) == 0 ? "0.1" : "1.0");
       break;
diff --git a/src/nnet3/nnet-training.cc b/src/nnet3/nnet-training.cc
index a9093523222..6bff30c501b 100644
--- a/src/nnet3/nnet-training.cc
+++ b/src/nnet3/nnet-training.cc
@@ -88,8 +88,11 @@ void NnetTrainer::Train(const NnetExample &eg) {
 
 void NnetTrainer::TrainInternal(const NnetExample &eg,
                                 const NnetComputation &computation) {
+  // note: because we give the 1st arg (nnet_) as a pointer to the
+  // constructor of 'computer', it will use that copy of the nnet to
+  // store stats.  This is mainly important for batch-norm.
   NnetComputer computer(config_.compute_config, computation,
-                        *nnet_, delta_nnet_);
+                        nnet_, delta_nnet_);
   // give the inputs to the computer object.
   computer.AcceptInputs(*nnet_, eg.io);
   computer.Run();
@@ -112,6 +115,10 @@ void NnetTrainer::TrainInternal(const NnetExample &eg,
   // happens when we use the model with batchnorm test-mode set).
   ScaleBatchnormStats(config_.batchnorm_stats_scale, nnet_);
 
+  // The following will only do something if we have a LinearComponent
+  // or AffineComponent with orthonormal-constraint set to a nonzero value.
+  ConstrainOrthonormal(nnet_);
+
   // Scale delta_nnet
   if (success)
     ScaleNnet(config_.momentum, delta_nnet_);
@@ -122,8 +129,11 @@ void NnetTrainer::TrainInternal(const NnetExample &eg,
 void NnetTrainer::TrainInternalBackstitch(const NnetExample &eg,
                                           const NnetComputation &computation,
                                           bool is_backstitch_step1) {
+  // note: because we give the 1st arg (nnet_) as a pointer to the
+  // constructor of 'computer', it will use that copy of the nnet to
+  // store stats.  This is mainly important for batch-norm.
   NnetComputer computer(config_.compute_config, computation,
-                        *nnet_, delta_nnet_);
+                        nnet_, delta_nnet_);
   // give the inputs to the computer object.
   computer.AcceptInputs(*nnet_, eg.io);
   computer.Run();
@@ -159,6 +169,21 @@ void NnetTrainer::TrainInternalBackstitch(const NnetExample &eg,
       max_change_scale, scale_adding, nnet_,
       &num_max_change_per_component_applied_, &num_max_change_global_applied_);
 
+  if (is_backstitch_step1) {
+    // The following will only do something if we have a LinearComponent or
+    // AffineComponent with orthonormal-constraint set to a nonzero value.  We
+    // choose to do this only on the 1st backstitch step, for efficiency.
+    ConstrainOrthonormal(nnet_);
+  }
+
+  if (!is_backstitch_step1) {
+    // Scale down the batchnorm stats (keeps them fresh... this affects what
+    // happens when we use the model with batchnorm test-mode set).  Do this
+    // after backstitch step 2 so that the stats are scaled down before we
+    // start the next minibatch.
+    ScaleBatchnormStats(config_.batchnorm_stats_scale, nnet_);
+  }
+
   ScaleNnet(0.0, delta_nnet_);
 }
 
diff --git a/src/nnet3/nnet-utils.cc b/src/nnet3/nnet-utils.cc
index 64fc3003609..fd2229cace8 100644
--- a/src/nnet3/nnet-utils.cc
+++ b/src/nnet3/nnet-utils.cc
@@ -22,6 +22,7 @@
 #include "nnet3/nnet-utils.h"
 #include "nnet3/nnet-graph.h"
 #include "nnet3/nnet-simple-component.h"
+#include "nnet3/nnet-normalize-component.h"
 #include "nnet3/nnet-general-component.h"
 #include "nnet3/nnet-convolutional-component.h"
 #include "nnet3/nnet-parse.h"
@@ -491,9 +492,7 @@ void SetDropoutProportion(BaseFloat dropout_proportion,
 bool HasBatchnorm(const Nnet &nnet) {
   for (int32 c = 0; c < nnet.NumComponents(); c++) {
     const Component *comp = nnet.GetComponent(c);
-    const BatchNormComponent *bc =
-        dynamic_cast<const BatchNormComponent*>(comp);
-    if (bc != NULL)
+    if (dynamic_cast<const BatchNormComponent*>(comp) != NULL)
       return true;
   }
   return false;
@@ -859,6 +858,105 @@ class SvdApplier {
   std::string component_name_pattern_;
 };
 
+// Does an update that moves M closer to being a (matrix with
+// orthonormal rows) times 'scale'.  Note: this will diverge if
+// we start off with singular values too far from 'scale'.
+void ConstrainOrthonormalInternal(BaseFloat scale, CuMatrixBase<BaseFloat> *M) {
+  // A larger alpha will update faster but will be more prone to instability.
+  // I believe the scalar value below shouldn't be more than 0.25, or maybe
+  // 0.5, or it will always be unstable.  It should be > 0.0.
+
+  // The factor of 1/scale^2 is, I *believe*, going to give us the right kind
+  // of invariance w.r.t. the scale.  To explain why this is the appropriate
+  // factor, look at the statement
+  //   M_update.AddMatMat(-4.0 * alpha, P, kNoTrans, *M, kNoTrans, 0.0);
+  // where P is proportional to scale^2, M to 'scale' and alpha to 1/scale^2,
+  // so the change in M_update is proportional to 'scale'.  We'd like
+  // 'M_update' to be proportional to 'scale'.  This reasoning is very
+  // hand-wavey but I think it can be made rigorous.  This is about remaining
+  // stable (not prone to divergence) even for very large or small values of
+  // 'scale'.
+  BaseFloat alpha = 0.125 / (scale * scale);
+
+  // We'd like to enforce the rows of M to be orthonormal.
+  // Define P = M M^T.  If P is unit then M has orthonormal rows.
+  // We actually want P to equal scale^2 * I, so that M's rows are
+  // orthogonal with 2-norms equal to 'scale'.
+  // We (notionally) add to the objective function, the value
+  // -alpha times the sum of squared elements of Q = (P - scale^2 * I).
+  int32 rows = M->NumRows(), cols = M->NumCols();
+  CuMatrix<BaseFloat> M_update(rows, cols);
+  CuMatrix<BaseFloat> P(rows, rows);
+  P.SymAddMat2(1.0, *M, kNoTrans, 0.0);
+  P.CopyLowerToUpper();
+  P.AddToDiag(-1.0 * scale * scale);
+
+  if (GetVerboseLevel() >= 1) {
+    BaseFloat error = P.FrobeniusNorm();
+    KALDI_VLOG(2) << "Error in orthogonality is " << error;
+  }
+
+  // At this point, the matrix P contains what, in the math, would be
+  // Q = P - scale^2*I.  The derivative of the objective function w.r.t. an
+  // element q(i,j) of Q is now equal to -2*alpha*q(i,j), i.e. we could write
+  // q_deriv(i,j) = -2*alpha*q(i,j).  This is also the derivative of the
+  // objective function w.r.t. p(i,j): i.e. p_deriv(i,j) = -2*alpha*q(i,j).
+  // Suppose we have defined this matrix as 'P_deriv'.
+  // The derivative of the objective w.r.t M equals
+  // 2 * P_deriv * M, which equals -4*alpha*(P-scale^2*I)*M.
+  // (Currently the matrix P contains what, in the math, is P-scale^2*I).
+  M_update.AddMatMat(-4.0 * alpha, P, kNoTrans, *M, kNoTrans, 0.0);
+  M->AddMat(1.0, M_update);
+}
+
+/**
+   This function, to be called after processing every minibatch, is responsible
+   for enforcing the orthogonality constraint for any components of type
+   LinearComponent or inheriting from AffineComponent that have the
+   "orthonormal-constraint" value set.
+ */
+void ConstrainOrthonormal(Nnet *nnet) {
+
+  for (int32 c = 0; c < nnet->NumComponents(); c++) {
+    Component *component = nnet->GetComponent(c);
+    LinearComponent *lc = dynamic_cast<LinearComponent*>(component);
+    if (lc != NULL && lc->OrthonormalConstraint() != 0.0) {
+      if (RandInt(0, 3) != 0)
+        continue;  // For efficiency, only do this every 4 minibatches-- it
+                   // won't stray far.
+      BaseFloat scale = lc->OrthonormalConstraint();
+      KALDI_ASSERT(scale > 0.0);
+
+      CuMatrixBase<BaseFloat> &params = lc->Params();
+      int32 rows = params.NumRows(), cols = params.NumCols();
+      if (rows <= cols) {
+        ConstrainOrthonormalInternal(scale, &params);
+      } else {
+        CuMatrix<BaseFloat> params_trans(params, kTrans);
+        ConstrainOrthonormalInternal(scale, &params_trans);
+        params.CopyFromMat(params_trans, kTrans);
+      }
+    }
+
+    AffineComponent *ac = dynamic_cast<AffineComponent*>(component);
+    if (ac != NULL && ac->OrthonormalConstraint() != 0.0) {
+      if (RandInt(0, 3) != 0)
+        continue;  // For efficiency, only do this every 4 minibatches-- it
+                   // won't stray far.
+      BaseFloat scale = ac->OrthonormalConstraint();
+      KALDI_ASSERT(scale > 0.0);
+      CuMatrixBase<BaseFloat> &params = ac->LinearParams();
+      int32 rows = params.NumRows(), cols = params.NumCols();
+      if (rows <= cols) {
+        ConstrainOrthonormalInternal(scale, &params);
+      } else {
+        CuMatrix<BaseFloat> params_trans(params, kTrans);
+        ConstrainOrthonormalInternal(scale, &params_trans);
+        params.CopyFromMat(params_trans, kTrans);
+      }
+    }
+  }
+}
+
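The function above only nudges M toward orthonormality on each call; the following standalone sketch (not part of the patch; plain C++ on a made-up 2x3 matrix, scale = 1.0) applies the same update rule repeatedly and shows the orthogonality error shrinking toward zero:

#include <cmath>
#include <cstdio>

const int R = 2, C = 3;

// Frobenius norm of M M^T - I.
double OrthError(const double M[R][C]) {
  double err = 0.0;
  for (int i = 0; i < R; i++)
    for (int j = 0; j < R; j++) {
      double p = 0.0;
      for (int k = 0; k < C; k++) p += M[i][k] * M[j][k];
      if (i == j) p -= 1.0;
      err += p * p;
    }
  return std::sqrt(err);
}

int main() {
  double M[R][C] = { { 1.1, 0.2, -0.1 }, { 0.4, 0.9, 0.3 } };
  const double scale = 1.0, alpha = 0.125 / (scale * scale);
  for (int iter = 0; iter < 12; iter++) {
    std::printf("iter %2d: |M M^T - I| = %.6f\n", iter, OrthError(M));
    double Q[R][R];  // Q = M M^T - scale^2 * I
    for (int i = 0; i < R; i++)
      for (int j = 0; j < R; j++) {
        Q[i][j] = (i == j ? -scale * scale : 0.0);
        for (int k = 0; k < C; k++) Q[i][j] += M[i][k] * M[j][k];
      }
    // M <-- M - 4 * alpha * Q * M, as in the code above.
    double upd[R][C] = {};
    for (int i = 0; i < R; i++)
      for (int k = 0; k < C; k++)
        for (int j = 0; j < R; j++)
          upd[i][k] += Q[i][j] * M[j][k];
    for (int i = 0; i < R; i++)
      for (int k = 0; k < C; k++)
        M[i][k] -= 4.0 * alpha * upd[i][k];
  }
  return 0;
}

In singular-value terms each update maps s to s - 0.5 * (s^2 - 1) * s, whose fixed point at s = 1 has zero derivative, so convergence near the fixed point is quadratic; this is why a few minibatches suffice even though each call only approximates the constraint.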
// This code has been broken out of ReadEditConfig as it's quite long.
// It implements the internals of the edit directive 'reduce-rank'.
diff --git a/src/nnet3/nnet-utils.h b/src/nnet3/nnet-utils.h
index d961b7cb6a0..efa36e1f64c 100644
--- a/src/nnet3/nnet-utils.h
+++ b/src/nnet3/nnet-utils.h
@@ -168,8 +168,7 @@ std::string NnetInfo(const Nnet &nnet);
 void SetDropoutProportion(BaseFloat dropout_proportion,
                           Nnet *nnet);
 
-/// Returns true if nnet has at least one component of type
-/// BatchNormComponent.
+/// Returns true if nnet has at least one component of type BatchNormComponent.
 bool HasBatchnorm(const Nnet &nnet);
 
 /// This function affects only components of type BatchNormComponent.
@@ -251,7 +250,6 @@ struct CollapseModelConfig {
 void CollapseModel(const CollapseModelConfig &config,
                    Nnet *nnet);
 
-
 /**
    ReadEditConfig() reads a file with a similar-looking format to the config file
    read by Nnet::ReadConfig(), but this consists of a sequence of operations to
@@ -452,6 +450,18 @@ void ScaleBatchnormStats(BaseFloat batchnorm_stats_scale,
                          Nnet *nnet);
 
+/**
+   This function, to be called after processing every minibatch, is responsible
+   for enforcing the orthogonality constraint for any components of type
+   LinearComponent or inheriting from AffineComponent that have the
+   "orthonormal-constraint" value set to nonzero.
+
+   In order to make it efficient on GPU, it doesn't make the parameter matrix
+   completely orthonormal; it just makes it closer to being orthonormal (times
+   the 'orthonormal-constraint' value).  Over multiple iterations this rapidly
+   makes it almost exactly orthonormal.
+ */
+void ConstrainOrthonormal(Nnet *nnet);
+
 /**
    This utility function can be used to obtain the number of distinct 'n'
    values in a training example.  This is the number of examples (e.g.
   sequences) that have been combined into a single example.  (Actually
diff --git a/src/nnet3bin/nnet3-show-progress.cc b/src/nnet3bin/nnet3-show-progress.cc
index 7e937f0c211..25a65dbed5c 100644
--- a/src/nnet3bin/nnet3-show-progress.cc
+++ b/src/nnet3bin/nnet3-show-progress.cc
@@ -132,6 +132,10 @@ int main(int argc, char *argv[]) {
     { // Get info about magnitude of parameter change.
       Nnet diff_nnet(nnet1);
      AddNnet(nnet2, -1.0, &diff_nnet);
+      if (GetVerboseLevel() >= 1) {
+        KALDI_VLOG(1) << "Printing info for the difference between the neural nets: "
+                      << diff_nnet.Info();
+      }
       int32 num_updatable = NumUpdatableComponents(diff_nnet);
       Vector<BaseFloat> dot_prod(num_updatable);
       ComponentDotProducts(diff_nnet, diff_nnet, &dot_prod);
@@ -139,12 +143,15 @@ int main(int argc, char *argv[]) {
       KALDI_LOG << "Parameter differences per layer are "
                 << PrintVectorPerUpdatableComponent(nnet1, dot_prod);
 
-      Vector<BaseFloat> baseline_prod(num_updatable);
+      Vector<BaseFloat> baseline_prod(num_updatable),
+          new_prod(num_updatable);
       ComponentDotProducts(nnet1, nnet1, &baseline_prod);
+      ComponentDotProducts(nnet2, nnet2, &new_prod);
       baseline_prod.ApplyPow(0.5);
+      new_prod.ApplyPow(0.5);
-      KALDI_LOG << "Norms of parameter matrices are "
-                << PrintVectorPerUpdatableComponent(nnet1, baseline_prod);
+      KALDI_LOG << "Norms of parameter matrices from <new-nnet> are "
+                << PrintVectorPerUpdatableComponent(nnet2, new_prod);
 
       dot_prod.DivElements(baseline_prod);
       KALDI_LOG << "Relative parameter differences per layer are "
diff --git a/src/rnnlm/rnnlm-embedding-training.cc b/src/rnnlm/rnnlm-embedding-training.cc
index f490f490f61..0e45fe665b5 100644
--- a/src/rnnlm/rnnlm-embedding-training.cc
+++ b/src/rnnlm/rnnlm-embedding-training.cc
@@ -77,12 +77,11 @@ void RnnlmEmbeddingTrainer::Train(
     if (l2_term != 0.0) {
       embedding_deriv->AddMat(l2_term, *embedding_mat_);
     }
-   }
+  }
 
   BaseFloat scale = 1.0;
   if (config_.use_natural_gradient) {
-    preconditioner_.PreconditionDirections(embedding_deriv, NULL,
-                                           &scale);
+    preconditioner_.PreconditionDirections(embedding_deriv, &scale);
   }
   scale *= config_.learning_rate;
   num_minibatches_++;
@@ -130,11 +129,10 @@ void RnnlmEmbeddingTrainer::Train(
     if (l2_term != 0.0) {
      embedding_deriv->AddToRows(l2_term, active_words, embedding_mat_);
    }
-   }
+  }
 
   BaseFloat scale = 1.0;
   if (config_.use_natural_gradient) {
-    preconditioner_.PreconditionDirections(embedding_deriv, NULL,
-                                           &scale);
+    preconditioner_.PreconditionDirections(embedding_deriv, &scale);
   }
   scale *= config_.learning_rate;
   num_minibatches_++;