diff --git a/egs/cifar/v1/local/nnet3/compare.sh b/egs/cifar/v1/local/nnet3/compare.sh new file mode 100755 index 00000000000..c5208c38ac0 --- /dev/null +++ b/egs/cifar/v1/local/nnet3/compare.sh @@ -0,0 +1,56 @@ +#!/bin/bash + +# this script is used for comparing trained models between systems. +# e.g. local/nnet3/compare.sh exp/resnet1{b,c}_cifar10 + + +if [ $# == 0 ]; then + echo "Usage: $0: [ ... ]" + echo "e.g.: $0 exp/resnet1{b,c}_cifar10" + exit 1 +fi + +echo "# $0 $*" + + + +echo -n "# System " +for x in $*; do printf "% 12s" " $(basename $x)"; done +echo + + +echo -n "# final test accuracy: " +for x in $*; do + acc=$(grep acc $x/log/compute_prob_valid.final.log | awk '{print $8}') + printf "% 12s" $acc +done + +echo +echo -n "# final train accuracy: " +for x in $*; do + acc=$(grep acc $x/log/compute_prob_train.final.log | awk '{print $8}') + printf "% 12s" $acc +done + +echo +echo -n "# final test objf: " +for x in $*; do + objf=$(grep log-like $x/log/compute_prob_valid.final.log | awk '{print $8}') + printf "% 12s" $objf +done + +echo +echo -n "# final train objf: " +for x in $*; do + objf=$(grep log-like $x/log/compute_prob_train.final.log | awk '{print $8}') + printf "% 12s" $objf +done + +echo +echo -n "# num-parameters: " +for x in $*; do + params=$(grep num-parameters $x/log/progress.1.log | awk '{print $2}') + printf "% 12s" $params +done + +echo diff --git a/egs/cifar/v1/local/nnet3/run_cnn_aug_1b.sh b/egs/cifar/v1/local/nnet3/run_cnn_aug_1b.sh index f31ad7601a9..8e5f83ea2d5 100755 --- a/egs/cifar/v1/local/nnet3/run_cnn_aug_1b.sh +++ b/egs/cifar/v1/local/nnet3/run_cnn_aug_1b.sh @@ -1,10 +1,14 @@ #!/bin/bash -# aug_1b is the same as 1e but with data augmentation -# accuracy 84.5% (1e has accuracy 83%) +# run_cnn_aug_1b is the same as run_cnn_1e but with data augmentation. + +# accuracy is 0.857, vs. 0.83 for the un-augmented baseline. + +# exp/cnn_aug_1b_cifar10: num-iters=60 nj=1..2 num-params=2.2M dim=96->10 combine=-0.40->-0.38 loglike:train/valid[39,59,final]=(-0.35,-0.26,-0.26/-0.47,-0.42,-0.42) accuracy:train/valid[39,59,final]=(0.88,0.91,0.91/0.84,0.86,0.86) + +# grep Overall exp/cnn_aug_1b_cifar10/log/compute_prob_valid.final.log | grep acc +# LOG (nnet3-compute-prob[5.1]:PrintTotalStats():nnet-diagnostics.cc:165) Overall accuracy for 'output' is 0.8567 per frame, over 10000 frames.# -# steps/info/nnet3_dir_info.pl exp/cnn_aug_1b_cifar10 -# exp/cnn_aug_1b_cifar10/: num-iters=60 nj=1..2 num-params=0.2M dim=96->10 combine=-0.53->-0.50 loglike:train/valid[39,59,final]=(-0.57,-0.45,-0.48/-0.68,-0.62,-0.64) accuracy:train/valid[39,59,final]=(0.80,0.84,0.83/0.76,0.79,0.78) # Set -e here so that we catch if any executable fails immediately set -euo pipefail @@ -17,7 +21,7 @@ train_stage=-10 dataset=cifar10 srand=0 reporting_email= -affix=_aug_1e +affix=_aug_1b # End configuration section. 
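The hunk just below adds --num-channels=3 to the augmentation options. A rough numpy sketch (not the actual nnet3-egs-augment-image code) of what horizontal flip and vertical shift do to one CIFAR example, assuming each frame is one image column (width = time axis, as these recipes use) and the 96-dim frame interleaves the 3 colour channels fastest along the 32-pixel height — which is why the shift needs to know num-channels. Shift amounts and edge handling are simplified here:

import numpy as np

def augment(eg, num_channels=3, flip=True, vshift=1):
    # eg: (width, height * num_channels) feature matrix for one image; width is the time axis.
    width, dim = eg.shape
    height = dim // num_channels
    out = eg.copy()
    if flip:
        out = out[::-1, :]          # horizontal flip = reverse the time (width) axis
    if vshift:
        # vertical shift moves whole pixels, i.e. steps of num_channels along the frame
        out = np.roll(out.reshape(width, height, num_channels), vshift, axis=1)
        out = out.reshape(width, dim)
    return out

eg = np.random.randn(32, 96).astype(np.float32)   # CIFAR: 32 wide, 32 high, 3 channels
aug = augment(eg)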
@@ -93,7 +97,7 @@ if [ $stage -le 2 ]; then steps/nnet3/train_raw_dnn.py --stage=$train_stage \ --cmd="$train_cmd" \ - --image.augmentation-opts="--horizontal-flip-prob=0.5 --horizontal-shift=0.1 --vertical-shift=0.1" \ + --image.augmentation-opts="--horizontal-flip-prob=0.5 --horizontal-shift=0.1 --vertical-shift=0.1 --num-channels=3" \ --trainer.srand=$srand \ --trainer.max-param-change=2.0 \ --trainer.num-epochs=30 \ diff --git a/egs/cifar/v1/local/nnet3/run_cnn_aug_1c.sh b/egs/cifar/v1/local/nnet3/run_cnn_aug_1c.sh index 23c801290a3..184ea0fa306 100755 --- a/egs/cifar/v1/local/nnet3/run_cnn_aug_1c.sh +++ b/egs/cifar/v1/local/nnet3/run_cnn_aug_1c.sh @@ -5,7 +5,7 @@ # accuracy improved from 85.8% to 88% # steps/info/nnet3_dir_info.pl exp/cnn_aug_1c_cifar10/ -# exp/cnn_aug_1c_cifar10/: num-iters=200 nj=1..2 num-params=2.2M dim=96->10 combine=-0.24->-0.24 loglike:train/valid[132,199,final]=(-0.18,-0.12,-0.12/-0.39,-0.37,-0.37) accuracy:train/valid[132,199,final]=(0.94,0.96,0.96/0.87,0.88,0.88) +# exp/cnn_aug_1c_cifar10: num-iters=200 nj=1..2 num-params=2.2M dim=96->10 combine=-0.23->-0.24 loglike:train/valid[132,199,final]=(-0.17,-0.12,-0.12/-0.39,-0.36,-0.37) accuracy:train/valid[132,199,final]=(0.94,0.96,0.96/0.87,0.88,0.88) # Set -e here so that we catch if any executable fails immediately set -euo pipefail diff --git a/egs/cifar/v1/local/nnet3/run_resnet_1a.sh b/egs/cifar/v1/local/nnet3/run_resnet_1a.sh new file mode 100755 index 00000000000..8f41bb96c07 --- /dev/null +++ b/egs/cifar/v1/local/nnet3/run_resnet_1a.sh @@ -0,0 +1,144 @@ +#!/bin/bash + +# run_resnet_1a.sh is a quite well-performing resnet. +# It includes a form of shrinkage that approximates l2 regularization. +# (c.f. --proportional-shrink). + +# Definitely better: + +# local/nnet3/compare.sh exp/resnet1a_cifar10 +# System resnet1a_cifar10 +# final test accuracy: 0.9481 +# final train accuracy: 0.9992 +# final test objf: -0.171369 +# final train objf: -0.00980603 +# num-parameters: 1322730 + +# local/nnet3/compare.sh exp/resnet1a_cifar100 +# System resnet1a_cifar100 +# final test accuracy: 0.7478 +# final train accuracy: 0.9446 +# final test objf: -0.899789 +# final train objf: -0.22468 +# num-parameters: 1345860 + + + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + + + +# training options +stage=0 +train_stage=-10 +dataset=cifar10 +srand=0 +reporting_email= +affix=1a + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=96 name=input + conv-layer name=conv1 height-in=32 height-out=32 time-offsets=-1,0,1 required-time-offsets=0 height-offsets=-1,0,1 num-filters-out=$nf1 + res-block name=res2 num-filters=$nf1 height=32 time-period=1 $res_opts + res-block name=res3 num-filters=$nf1 height=32 time-period=1 $res_opts + conv-layer name=conv4 height-in=32 height-out=16 height-subsample-out=2 time-offsets=-1,0,1 $common num-filters-out=$nf2 + res-block name=res5 num-filters=$nf2 height=16 time-period=2 $res_opts + res-block name=res6 num-filters=$nf2 height=16 time-period=2 $res_opts + conv-layer name=conv7 height-in=16 height-out=8 height-subsample-out=2 time-offsets=-2,0,2 $common num-filters-out=$nf3 + res-block name=res8 num-filters=$nf3 num-bottleneck-filters=$nb3 height=8 time-period=4 $res_opts + res-block name=res9 num-filters=$nf3 num-bottleneck-filters=$nb3 height=8 time-period=4 $res_opts + res-block name=res10 num-filters=$nf3 num-bottleneck-filters=$nb3 height=8 time-period=4 $res_opts + channel-average-layer name=channel-average input=Append(2,6,10,14,18,22,24,28) dim=$nf3 + output-layer name=output learning-rate-factor=0.1 dim=$num_targets +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 2 ]; then + + steps/nnet3/train_raw_dnn.py --stage=$train_stage \ + --cmd="$train_cmd" \ + --image.augmentation-opts="--horizontal-flip-prob=0.5 --horizontal-shift=0.1 --vertical-shift=0.1 --num-channels=3" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=60 \ + --egs.frames-per-eg=1 \ + --trainer.optimization.num-jobs-initial=1 \ + --trainer.optimization.num-jobs-final=2 \ + --trainer.optimization.initial-effective-lrate=0.003 \ + --trainer.optimization.final-effective-lrate=0.0003 \ + --trainer.optimization.minibatch-size=256,128,64 \ + --trainer.optimization.proportional-shrink=50.0 \ + --trainer.shuffle-buffer-size=2000 \ + --egs.dir="$egs" \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --dir=$dir || exit 1; +fi + + +exit 0; diff --git a/egs/cifar/v1/local/nnet3/run_resnet_1b.sh b/egs/cifar/v1/local/nnet3/run_resnet_1b.sh new file mode 100755 index 00000000000..f8f3b563e6c --- /dev/null +++ b/egs/cifar/v1/local/nnet3/run_resnet_1b.sh @@ -0,0 +1,143 @@ +#!/bin/bash + +# 1b is as 1a but using more epochs: 100 instead of 60. +# This helps a bit. 
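Both resnet scripts pass --trainer.optimization.proportional-shrink=50.0, described above as approximating l2 regularization. A minimal sketch of why, assuming plain SGD (the natural-gradient updates only approximately behave this way): scaling the parameters by shrink-value = (1 - proportional-shrink * lrate) each iteration matches, to first order in the learning rate, the step you would get from an explicit 0.5 * shrink * ||w||^2 penalty.

lrate, shrink = 0.003, 50.0   # initial effective lrate and proportional-shrink from these scripts

def step_with_l2_penalty(w, grad):
    # SGD on objf + 0.5 * shrink * ||w||^2; the penalty's gradient is shrink * w.
    return w - lrate * (grad + shrink * w)

def step_then_scale(w, grad):
    # what proportional-shrink does: an ordinary step, then scale by (1 - shrink * lrate)
    return (w - lrate * grad) * (1.0 - shrink * lrate)

# The two differ only by lrate**2 * shrink * grad, i.e. to first order in the
# learning rate they are the same update.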
+ +#exp/resnet1b_cifar10: num-iters=133 nj=1..2 num-params=1.3M dim=96->10 combine=-0.01->-0.01 loglike:train/valid[87,132,final]=(-0.13,-0.03,-0.01/-0.27,-0.21,-0.16) accuracy:train/valid[87,132,final]=(0.95,0.99,1.00/0.91,0.94,0.95) +#exp/resnet1b_cifar100: num-iters=133 nj=1..2 num-params=1.3M dim=96->100 combine=-0.22->-0.19 loglike:train/valid[87,132,final]=(-0.75,-0.27,-0.16/-1.22,-1.06,-0.89) accuracy:train/valid[87,132,final]=(0.78,0.93,0.96/0.67,0.72,0.76) + + +# local/nnet3/compare.sh exp/resnet1a_cifar10 exp/resnet1b_cifar10 +# System resnet1a_cifar10 resnet1b_cifar10 +# final test accuracy: 0.9481 0.9521 +# final train accuracy: 0.9992 0.9998 +# final test objf: -0.171369 -0.160283 +# final train objf: -0.00980603 -0.00672504 +# num-parameters: 1322730 1322730 + +# local/nnet3/compare.sh exp/resnet1a_cifar100 exp/resnet1b_cifar100 +# System resnet1a_cifar100 resnet1b_cifar100 +# final test accuracy: 0.7478 0.7597 +# final train accuracy: 0.9446 0.9638 +# final test objf: -0.899789 -0.889707 +# final train objf: -0.22468 -0.163996 +# num-parameters: 1345860 1345860 + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + + + +# training options +stage=0 +train_stage=-10 +dataset=cifar10 +srand=0 +reporting_email= +affix=1b + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=96 name=input + conv-layer name=conv1 height-in=32 height-out=32 time-offsets=-1,0,1 required-time-offsets=0 height-offsets=-1,0,1 num-filters-out=$nf1 + res-block name=res2 num-filters=$nf1 height=32 time-period=1 $res_opts + res-block name=res3 num-filters=$nf1 height=32 time-period=1 $res_opts + conv-layer name=conv4 height-in=32 height-out=16 height-subsample-out=2 time-offsets=-1,0,1 $common num-filters-out=$nf2 + res-block name=res5 num-filters=$nf2 height=16 time-period=2 $res_opts + res-block name=res6 num-filters=$nf2 height=16 time-period=2 $res_opts + conv-layer name=conv7 height-in=16 height-out=8 height-subsample-out=2 time-offsets=-2,0,2 $common num-filters-out=$nf3 + res-block name=res8 num-filters=$nf3 num-bottleneck-filters=$nb3 height=8 time-period=4 $res_opts + res-block name=res9 num-filters=$nf3 num-bottleneck-filters=$nb3 height=8 time-period=4 $res_opts + res-block name=res10 num-filters=$nf3 num-bottleneck-filters=$nb3 height=8 time-period=4 $res_opts + channel-average-layer name=channel-average input=Append(2,6,10,14,18,22,24,28) dim=$nf3 + output-layer name=output learning-rate-factor=0.1 dim=$num_targets +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 2 ]; then + + steps/nnet3/train_raw_dnn.py --stage=$train_stage \ + --cmd="$train_cmd" \ + --image.augmentation-opts="--horizontal-flip-prob=0.5 --horizontal-shift=0.1 --vertical-shift=0.1 --num-channels=3" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=100 \ + --egs.frames-per-eg=1 \ + --trainer.optimization.num-jobs-initial=1 \ + --trainer.optimization.num-jobs-final=2 \ + --trainer.optimization.initial-effective-lrate=0.003 \ + --trainer.optimization.final-effective-lrate=0.0003 \ + --trainer.optimization.minibatch-size=256,128,64 \ + --trainer.optimization.proportional-shrink=50.0 \ + --trainer.shuffle-buffer-size=2000 \ + --egs.dir="$egs" \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + 
--dir=$dir || exit 1; +fi + + +exit 0; diff --git a/egs/cifar/v1/run.sh b/egs/cifar/v1/run.sh index c4760672169..084a8a53041 100755 --- a/egs/cifar/v1/run.sh +++ b/egs/cifar/v1/run.sh @@ -17,6 +17,10 @@ fi # cifar10 egs preparation image/nnet3/get_egs.sh --cmd "$train_cmd" data/cifar10_train data/cifar10_test exp/cifar10_egs - # cifar100 egs preparation image/nnet3/get_egs.sh --cmd "$train_cmd" data/cifar100_train data/cifar100_test exp/cifar100_egs + + +# prepare a different version of the egs with 2 instead of 3 archives. +image/nnet3/get_egs.sh --egs-per-archive 30000 --cmd "$train_cmd" data/cifar10_train data/cifar10_test exp/cifar10_egs2 +image/nnet3/get_egs.sh --egs-per-archive 30000 --cmd "$train_cmd" data/cifar100_train data/cifar100_test exp/cifar100_egs2 diff --git a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py index 4a8505d4f3a..fb62d579510 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py @@ -260,7 +260,8 @@ def train_one_iteration(dir, iter, srand, egs_dir, do_average = (iter > 0) raw_model_string = ("nnet3-am-copy --raw=true --learning-rate={0} " - "{1}/{2}.mdl - |".format(learning_rate, dir, iter)) + "--scale={1} {2}/{3}.mdl - |".format( + learning_rate, shrinkage_value, dir, iter)) if do_average: cur_num_chunk_per_minibatch_str = num_chunk_per_minibatch_str @@ -315,16 +316,14 @@ def train_one_iteration(dir, iter, srand, egs_dir, common_train_lib.get_average_nnet_model( dir=dir, iter=iter, nnets_list=" ".join(nnets_list), - run_opts=run_opts, - shrink=shrinkage_value) + run_opts=run_opts) else: # choose the best model from different jobs common_train_lib.get_best_nnet_model( dir=dir, iter=iter, best_model_index=best_model, - run_opts=run_opts, - shrink=shrinkage_value) + run_opts=run_opts) try: for i in range(1, num_jobs + 1): diff --git a/egs/wsj/s5/steps/libs/nnet3/train/common.py b/egs/wsj/s5/steps/libs/nnet3/train/common.py index 6c5e0d6d834..e18c43645ae 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/common.py @@ -78,26 +78,17 @@ def get_successful_models(num_models, log_file_pattern, def get_average_nnet_model(dir, iter, nnets_list, run_opts, - get_raw_nnet_from_am=True, shrink=None): - scale = 1.0 - if shrink is not None: - scale = shrink + get_raw_nnet_from_am=True): next_iter = iter + 1 if get_raw_nnet_from_am: - out_model = ("""- \| nnet3-am-copy --set-raw-nnet=- --scale={scale} \ + out_model = ("""- \| nnet3-am-copy --set-raw-nnet=- \ {dir}/{iter}.mdl {dir}/{next_iter}.mdl""".format( dir=dir, iter=iter, - next_iter=next_iter, - scale=scale)) + next_iter=next_iter)) else: - if shrink is not None: - out_model = """- \| nnet3-copy --scale={scale} \ - - {dir}/{next_iter}.raw""".format( - dir=dir, next_iter=next_iter, scale=scale) - else: - out_model = "{dir}/{next_iter}.raw".format(dir=dir, - next_iter=next_iter) + out_model = "{dir}/{next_iter}.raw".format( + dir=dir, next_iter=next_iter) common_lib.execute_command( """{command} {dir}/log/average.{iter}.log \ @@ -110,10 +101,7 @@ def get_average_nnet_model(dir, iter, nnets_list, run_opts, def get_best_nnet_model(dir, iter, best_model_index, run_opts, - get_raw_nnet_from_am=True, shrink=None): - scale = 1.0 - if shrink is not None: - scale = shrink + get_raw_nnet_from_am=True): best_model = "{dir}/{next_iter}.{best_model_index}.raw".format( dir=dir, @@ -130,11 +118,11 @@ def 
get_best_nnet_model(dir, iter, best_model_index, run_opts, common_lib.execute_command( """{command} {dir}/log/select.{iter}.log \ - nnet3-copy --scale={scale} {best_model} \ + nnet3-copy {best_model} \ {out_model}""".format(command=run_opts.command, dir=dir, iter=iter, best_model=best_model, - out_model=out_model, scale=scale)) + out_model=out_model)) def validate_chunk_width(chunk_width): @@ -530,8 +518,8 @@ def get_learning_rate(iter, num_jobs, num_iters, num_archives_processed, return num_jobs * effective_learning_rate -def do_shrinkage(iter, model_file, shrink_saturation_threshold, - get_raw_nnet_from_am=True): +def should_do_shrinkage(iter, model_file, shrink_saturation_threshold, + get_raw_nnet_from_am=True): if iter == 0: return True diff --git a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py index 010320d9170..319687aa4c0 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py @@ -77,8 +77,9 @@ def train_new_models(dir, iter, srand, num_jobs, if image_augmentation_opts: image_augmentation_cmd = ( - 'nnet3-egs-augment-image {aug_opts} ark:- ark:- |'.format( - aug_opts=image_augmentation_opts)) + 'nnet3-egs-augment-image --srand={srand} {aug_opts} ark:- ark:- |'.format( + srand=k+srand, + aug_opts=image_augmentation_opts)) else: image_augmentation_cmd = '' @@ -95,8 +96,7 @@ def train_new_models(dir, iter, srand, num_jobs, """nnet3-shuffle-egs --buffer-size={shuffle_buffer_size} """ """--srand={srand} ark:- ark:- | {aug_cmd} """ """nnet3-merge-egs --minibatch-size={minibatch_size_str} """ - """ark:- ark:- |" \ - {dir}/{next_iter}.{job}.raw""".format( + """ ark:- ark:- |" {dir}/{next_iter}.{job}.raw""".format( command=run_opts.command, train_queue_opt=run_opts.train_queue_opt, dir=dir, iter=iter, srand=iter + srand, @@ -185,12 +185,14 @@ def train_one_iteration(dir, iter, srand, egs_dir, if get_raw_nnet_from_am: raw_model_string = ("nnet3-am-copy --raw=true --learning-rate={0} " - "{1}/{2}.mdl - |".format(learning_rate, - dir, iter)) + "--scale={1} {2}/{3}.mdl - |".format( + learning_rate, shrinkage_value, + dir, iter)) else: - raw_model_string = ("nnet3-copy --learning-rate={lr} " + raw_model_string = ("nnet3-copy --learning-rate={lr} --scale={s} " "{dir}/{iter}.raw - |".format( - lr=learning_rate, dir=dir, iter=iter)) + lr=learning_rate, s=shrinkage_value, + dir=dir, iter=iter)) raw_model_string = raw_model_string + dropout_edit_string @@ -240,8 +242,7 @@ def train_one_iteration(dir, iter, srand, egs_dir, dir=dir, iter=iter, nnets_list=" ".join(nnets_list), run_opts=run_opts, - get_raw_nnet_from_am=get_raw_nnet_from_am, - shrink=shrinkage_value) + get_raw_nnet_from_am=get_raw_nnet_from_am) else: # choose the best model from different jobs @@ -249,8 +250,7 @@ def train_one_iteration(dir, iter, srand, egs_dir, dir=dir, iter=iter, best_model_index=best_model, run_opts=run_opts, - get_raw_nnet_from_am=get_raw_nnet_from_am, - shrink=shrinkage_value) + get_raw_nnet_from_am=get_raw_nnet_from_am) try: for i in range(1, num_jobs + 1): diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py index 7c01689e86c..d5c1bc39eaf 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py @@ -628,7 +628,8 @@ def __init__(self, first_token, key_to_value, prev_names = None): # Here we just list some likely combinations.. 
you can just add any # combinations you want to use, to this list. assert first_token in [ 'relu-layer', 'relu-renorm-layer', 'sigmoid-layer', - 'tanh-layer', 'relu-batchnorm-layer', 'relu-dropout-layer' ] + 'tanh-layer', 'relu-batchnorm-layer', 'relu-dropout-layer', + 'relu-batchnorm-dropout-layer' ] XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) def set_default_configs(self): diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/convolution.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/convolution.py index e14aca92b3b..12f4979c39a 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/convolution.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/convolution.py @@ -209,7 +209,7 @@ def output_name(self, auxiliary_output = None): assert auxiliary_output is None # note: the [:-1] is to remove the '-layer'. operations = self.layer_type.split('-')[:-1] - assert len(operations) > 1 + assert len(operations) >= 1 last_operation = operations[-1] assert last_operation in ['relu', 'conv', 'renorm', 'batchnorm', 'dropout'] @@ -264,7 +264,6 @@ def generate_cnn_config(self): a.append('{0}={1}'.format(opt_name, value)) conv_opts = ' '.join(a) - configs.append("### Begin convolutional layer '{0}'".format(name)) configs.append('component name={0}.conv type=TimeHeightConvolutionComponent ' '{1}'.format(name, conv_opts)) configs.append('component-node name={0}.conv component={0}.conv ' @@ -305,3 +304,434 @@ def generate_cnn_config(self): cur_descriptor = '{0}.{1}'.format(name, operation) return configs + + +# This class is for lines like the following: +# +# res-block name=res1 num-filters=64 height=32 time-period=1 +# +# It implements a residual block as in ResNets, but with some small differences +# that make it a little more general-- basically, instead of adding the input to +# the output, we put a convolutional layer in there but initialize it to the +# unit matrix and if you want you can give it a relatively small (or even zero) +# learning rate and max-change. And there is batch-norm in that path also. +# +# The number of filters is the same on the input and output; it is actually +# redundant to write it in the config file, because given that we know the +# height, we can work it out from the dimension of the input (as dimension = +# height * num-filters). But we allow it to be specified anyway, for clarity. +# +# Note: the res-block does not support subsampling or changing the number of +# filters. If you want to do that, we recommend that you should do it with a +# single relu-batchnorm-conv-layer. +# +# Here are the most important configuration values, with defaults shown if +# defaults exist: +# +# input='[-1]' Descriptor giving the input of the layer. +# height The input and output height of the image, e.g. 40. Note: the width +# is associated with the time dimension and is dealt with +# implicitly, so it's not specified here. +# num-filters The number of filters on the input and output, e.g. 64. +# It does not have to be specified; if it is not specified, +# we work it out from the input dimension. +# num-bottleneck-filters If specified then this will be a 'bottleneck' +# ResBlock, in which there is a 1x1 convolution from +# num-filters->num-bottleneck-filters, a 3x3 convolution +# from num-bottleneck-filters->num-bottleneck-filters, and +# a 1x1 convolution from num-bottleneck-filters->num-filters. +# +# time-period=1 Think of this as the stride in the time dimension. 
At the +# input of the network will always have time-period=1; then +# after subsampling once in time we'd have time-period=2; then +# after subsampling again we'd have time-period=4. Because of +# the way nnet3 works, subsampling on the time axis is an +# implicit, not explicit, operation. +# bypass-source=noop +# The output of this component is Sum(convolution, x), and +# this option controls what 'x' is. There are 3 options +# here: 'noop', 'input', 'relu' or 'batchnorm'. 'noop' is +# equivalent to 'input' in what it computes; it just +# inserts a 'noop' component in order to make the +# computation more efficient. For both 'noop' and +# 'input', x is the input to this component. If +# bypass-source=relu then we use the relu of the +# input; if 'batchnorm', then we use the relu+batchnorm of +# the input. +# allow-zero-padding=true By default this will allow zero-padding in the time +# dimension, meaning that you don't need extra frames at +# the input to compute the output. There may be ASR +# applications where you want to pad in the time dimension +# with repeats of the first or last frame (as we do for +# TDNNs), where it would be appropriate to write +# allow-zero-padding=false. Note: the way we have +# set it up, it does zero-padding on the height axis +# regardless +# +# Less important config variables: +# self-repair-scale=2.0e-05 This affects the ReLu's. It is a scale on the +# 'self-repair' mechanism that nudges the inputs to the +# ReLUs into the appropriate range in cases where +# the unit is active either too little of the time +# (<10%) or too much of the time (>90%). +# max-change=0.75 Max-parameter-change constant (per minibatch) +# used for convolutional components. +# +# +# The following natural-gradient-related configuration variables are passed in +# to the convolution components, if specified: +# use-natural-gradient (bool) +# rank-in, rank-out (int) +# num-minibatches-history (float) +# alpha-in, alpha-out (float) + +class XconfigResBlock(XconfigLayerBase): + def __init__(self, first_token, key_to_value, prev_names = None): + assert first_token == 'res-block' + XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) + + def set_default_configs(self): + self.config = {'input':'[-1]', + 'height':-1, + 'num-filters':-1, + 'num-bottleneck-filters':-1, + 'time-period':1, + 'self-repair-scale': 2.0e-05, + 'max-change': 0.75, + 'allow-zero-padding': True, + 'bypass-source' : 'noop', + # the following are not really inspected by this level of + # code, just passed through (but not if left at ''). + 'param-stddev':'', 'bias-stddev':'', + 'use-natural-gradient':'', + 'rank-in':'', 'rank-out':'', + 'num-minibatches-history':'', + 'alpha-in':'', 'alpha-out':''} + + def set_derived_configs(self): + # set 'num-filters' or check it.. + input_dim = self.descriptors['input']['dim'] + height = self.config['height'] + + cur_num_filters = self.config['num-filters'] + if cur_num_filters == -1: + if input_dim % height != 0: + raise RuntimeError("Specified image height {0} does not " + "divide the input dim {1}".format( + height, input_dim)) + self.config['num-filters'] = input_dim / height + elif input_dim != cur_num_filters * height: + raise RuntimeError("Expected the input-dim to equal " + "height={0} * num-filters={1} = {2}, but " + "it is {3}".format( + height, cur_num_filters, + height * cur_num_filters, + input_dim)); + + def check_configs(self): + # we checked the dimensions in set_derived_configs. 
+ if not self.config['bypass-source'] in [ + 'input', 'noop', 'relu', 'batchnorm' ]: + raise RuntimeError("Expected direct-convolution-source to " + "be input, relu or batchnorm, got: {1}".format( + self.config['direct-convolution-source'])) + + def auxiliary_outputs(self): + return [] + + def output_name(self, auxiliary_output = None): + bypass_source = self.config['bypass-source'] + b = self.config['num-bottleneck-filters'] + conv = ('{0}.conv2' if b <= 0 else '{0}.conv3').format(self.name) + if bypass_source == 'input': + residual = self.descriptors['input']['final-string'] + elif bypass_source == 'noop': + # we let the noop be the sum of the convolutional part and the + # input, so just return the output of the no-op component. + return '{0}.noop'.format(self.name) + elif bypass_source == 'relu': + residual = '{0}.relu1'.format(self.name) + else: + assert bypass_source == 'batchnorm' + residual = '{0}.batchnorm1'.format(self.name) + + return 'Sum({0}, {1})'.format(conv, residual) + + def output_dim(self, auxiliary_output = None): + assert auxiliary_output is None + input_dim = self.descriptors['input']['dim'] + return input_dim + + def get_full_config(self): + ans = [] + b = self.config['num-bottleneck-filters'] + if b <= 0: + config_lines = self.generate_normal_resblock_config() + else: + config_lines = self.generate_bottleneck_resblock_config() + + for line in config_lines: + for config_name in ['ref', 'final']: + # we do not support user specified matrices in CNN initialization + # so 'ref' and 'final' configs are the same. + ans.append((config_name, line)) + return ans + + # generate_normal_resblock_config is a convenience function to generate the + # res-block config (the non-bottleck version). + # + # The main path inside the res-block in the non-bottleneck case is as + # follows: + # + # input -> relu1 -> batchnorm1 -> conv1 -> relu2 -> batchnorm2 -> conv2 + # + # We put the relu before the batchnorm because we think it makes more sense; + # because the Torch people seemed to find that this works better + # (https://github.com/gcr/torch-residual-networks/issues/5); + # and because in our batchnorm component we haven't implemented the beta and + # gamma; these would be essential to having it work before relu, but + # when before a convolution or linear component, they add no extra modeling + # power. + # + # The output of the res-block can be the sum of the last convolutional + # component (conv2), with the input. However, the option ('bypass-source') + # controls whether we sum with the raw input, or its relu or relu+batchnorm. + # If the term is going to be the raw input, we give the option ('noop') and + # to cache the output sum via a NoOpComponent)-- because due to how nnet3 + # works, if we didn't do this, redundant summing operations would take + # place. 
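A hedged functional picture of the block that generate_normal_resblock_config() below emits; the names are illustrative stand-ins for the generated components, not real APIs:

def res_block_forward(x, relu, bn1, bn2, conv1, conv2, bypass_source='noop'):
    # main path: input -> relu1 -> batchnorm1 -> conv1 -> relu2 -> batchnorm2 -> conv2
    y = conv1(bn1(relu(x)))
    y = conv2(bn2(relu(y)))
    if bypass_source in ('noop', 'input'):
        return x + y                 # the NoOpComponent just caches this sum
    elif bypass_source == 'relu':
        return relu(x) + y           # residual taken after relu1
    else:                            # 'batchnorm'
        return bn1(relu(x)) + y      # residual taken after relu1 + batchnorm1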
+ def generate_normal_resblock_config(self): + configs = [] + + name = self.name + num_filters = self.config['num-filters'] + assert self.config['num-bottleneck-filters'] == -1 + height = self.config['height'] + input_descriptor = self.descriptors['input']['final-string'] + allow_zero_padding = self.config['allow-zero-padding'] + time_period = self.config['time-period'] + + # input -> relu1 -> batchnorm1 -> conv1 -> relu2 -> batchnorm2 -> conv2 + cur_descriptor = input_descriptor + for n in [1, 2]: + # the ReLU + configs.append('component name={0}.relu{1} type=RectifiedLinearComponent ' + 'dim={2} self-repair-scale={3}'.format( + name, n, num_filters * height, + self.config['self-repair-scale'])) + configs.append('component-node name={0}.relu{1} component={0}.relu{1} ' + 'input={2}'.format(name, n, cur_descriptor)) + + cur_descriptor = '{0}.relu{1}'.format(name, n) + + # the batch-norm + configs.append('component name={0}.batchnorm{1} type=BatchNormComponent dim={2} ' + 'block-dim={3}'.format( + name, n, num_filters * height, + num_filters)) + configs.append('component-node name={0}.batchnorm{1} component={0}.batchnorm{1} ' + 'input={2}'.format(name, n, cur_descriptor)) + cur_descriptor = '{0}.batchnorm{1}'.format(name, n) + + + # the convolution. + a = [] + for opt_name in [ + 'param-stddev', 'bias-stddev', 'use-natural-gradient', + 'max-change', 'rank-in', 'rank-out', 'num-minibatches-history', + 'alpha-in', 'alpha-out' ]: + value = self.config[opt_name] + if value != '': + a.append('{0}={1}'.format(opt_name, value)) + conv_opts = ('height-in={h} height-out={h} height-offsets=-1,0,1 time-offsets=-{p},0,{p} ' + 'num-filters-in={f} num-filters-out={f} {r} {o}'.format( + h=height, p=time_period, f=num_filters, + r=('required-time-offsets=0' if allow_zero_padding else ''), + o=' '.join(a))) + + configs.append('component name={0}.conv{1} type=TimeHeightConvolutionComponent ' + '{2}'.format(name, n, conv_opts)) + configs.append('component-node name={0}.conv{1} component={0}.conv{1} ' + 'input={2}'.format(name, n, cur_descriptor)) + cur_descriptor = '{0}.conv{1}'.format(name, n) + + + + if self.config['bypass-source'] == 'noop': + dim = self.descriptors['input']['dim'] + configs.append('component name={0}.noop dim={1} type=NoOpComponent'.format( + name, dim)) + configs.append('component-node name={0}.noop component={0}.noop ' + 'input=Sum({1}, {0}.conv2)'.format(name, + input_descriptor)) + + # Note: the function 'output_name' is responsible for returning the + # descriptor corresponding to the output of the network. + return configs + + + + # generate_bottleneck_resblock_config is a convenience function to generate the + # res-block config (this is the bottleneck version, where there is + # a 3x3 kernel with a smaller number of filters than at the input and output, + # sandwiched between two 1x1 kernels. + # + # The main path inside the res-block in the bottleneck case is as follows: + # + # input -> relu1 -> batchnorm1 -> conv1 -> relu2 -> batchnorm2 -> conv2 -> + # relu3 -> batchnorm3 -> conv3 + # + # power. + # + # The output of the res-block can be the sum of the last convolutional + # component (conv3), with the input. However we give the option + # ('bypass-source') to sum with the raw input, or its relu or + # relu+batchnorm. If the term is going to be the raw input, we give the + # option ('noop') and to cache the output sum via a NoOpComponent)-- because + # due to how nnet3 works, if we didn't do this, redundant summing operations + # would take place. 
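For the bottleneck variant generated by the method below, the point of the 1x1 -> 3x3 -> 1x1 sandwich is the parameter count; a back-of-the-envelope comparison with purely hypothetical filter counts (biases ignored):

num_filters, num_bottleneck = 256, 64      # illustrative values only

# plain res-block: two 3x3 (3 height-offsets x 3 time-offsets) convolutions
plain = 2 * (3 * 3 * num_filters * num_filters)

# bottleneck res-block: 1x1 down, 3x3 in the middle, 1x1 back up
bottleneck = (num_filters * num_bottleneck
              + 3 * 3 * num_bottleneck * num_bottleneck
              + num_bottleneck * num_filters)

print(plain, bottleneck)     # 1179648 vs 69632 parameters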
+ def generate_bottleneck_resblock_config(self): + configs = [] + + name = self.name + num_filters = self.config['num-filters'] + num_bottleneck_filters = self.config['num-bottleneck-filters'] + assert num_bottleneck_filters > 0 + height = self.config['height'] + input_descriptor = self.descriptors['input']['final-string'] + allow_zero_padding = self.config['allow-zero-padding'] + time_period = self.config['time-period'] + + # input -> relu1 -> batchnorm1 -> conv1 -> relu2 -> batchnorm2 -> conv2 + cur_descriptor = input_descriptor + cur_num_filters = num_filters + + for n in [1, 2, 3]: + # the ReLU + configs.append('component name={0}.relu{1} type=RectifiedLinearComponent ' + 'dim={2} self-repair-scale={3}'.format( + name, n, cur_num_filters * height, + self.config['self-repair-scale'])) + configs.append('component-node name={0}.relu{1} component={0}.relu{1} ' + 'input={2}'.format(name, n, cur_descriptor)) + + cur_descriptor = '{0}.relu{1}'.format(name, n) + + # the batch-norm + configs.append('component name={0}.batchnorm{1} type=BatchNormComponent dim={2} ' + 'block-dim={3}'.format( + name, n, cur_num_filters * height, + cur_num_filters)) + configs.append('component-node name={0}.batchnorm{1} component={0}.batchnorm{1} ' + 'input={2}'.format(name, n, cur_descriptor)) + cur_descriptor = '{0}.batchnorm{1}'.format(name, n) + + + # the convolution. + a = [] + for opt_name in [ + 'param-stddev', 'bias-stddev', 'use-natural-gradient', + 'max-change', 'rank-in', 'rank-out', 'num-minibatches-history', + 'alpha-in', 'alpha-out' ]: + value = self.config[opt_name] + if value != '': + a.append('{0}={1}'.format(opt_name, value)) + + height_offsets = ('-1,0,1' if n == 2 else '0') + time_offsets = ('-{t},0,{t}'.format(t=time_period) if n == 2 else '0') + num_filters_in = cur_num_filters + num_filters_out = (num_filters if n == 3 else num_bottleneck_filters) + cur_num_filters = num_filters_out + + conv_opts = ('height-in={h} height-out={h} height-offsets={ho} time-offsets={to} ' + 'num-filters-in={fi} num-filters-out={fo} {r} {o}'.format( + h=height, ho=height_offsets, to=time_offsets, + fi=num_filters_in, fo=num_filters_out, + r=('required-time-offsets=0' if allow_zero_padding else ''), + o=' '.join(a))) + + configs.append('component name={0}.conv{1} type=TimeHeightConvolutionComponent ' + '{2}'.format(name, n, conv_opts)) + configs.append('component-node name={0}.conv{1} component={0}.conv{1} ' + 'input={2}'.format(name, n, cur_descriptor)) + cur_descriptor = '{0}.conv{1}'.format(name, n) + + + + if self.config['bypass-source'] == 'noop': + dim = self.descriptors['input']['dim'] + configs.append('component name={0}.noop dim={1} type=NoOpComponent'.format( + name, dim)) + configs.append('component-node name={0}.noop component={0}.noop ' + 'input=Sum({1}, {0}.conv3)'.format(name, + input_descriptor)) + + # Note: the function 'output_name' is responsible for returning the + # descriptor corresponding to the output of the network. + return configs + + +# This layer just maps to a single component, a SumBlockComponent. It's for +# doing channel averaging at the end of neural networks. See scripts for +# examples of how to use it. +# An example line using this layer is: +# channel-average-layer name=channel-average input=Append(2, 4, 6, 8) dim=64 + +# the configuration value 'dim' is the output dimension of this layer. +# The input dimension is expected to be a multiple of 'dim'. The output +# will be the average of 'dim'-sized blocks of the input. 
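A rough numpy equivalent of the single SumBlockComponent that the class below generates:

import numpy as np

def channel_average(x, dim):
    # x: (num_frames, input_dim) with input_dim = n * dim; scale = dim / input_dim,
    # so the output is the mean of the n consecutive dim-sized blocks.
    n = x.shape[1] // dim
    return x.reshape(x.shape[0], n, dim).mean(axis=1)

In the resnet scripts the input is an Append(...) over several time offsets, so this one component effectively does global average pooling over time and height, leaving one output per filter.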
+class ChannelAverageLayer(XconfigLayerBase): + def __init__(self, first_token, key_to_value, prev_names = None): + assert first_token == "channel-average-layer" + XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) + + def set_default_configs(self): + self.config = {'input':'[-1]', + 'dim': -1 } + + def set_derived_configs(self): + pass + + def check_configs(self): + input_dim = self.descriptors['input']['dim'] + dim = self.config['dim'] + if dim <= 0: + raise RuntimeError("dim must be specified and > 0.") + if input_dim % dim != 0: + raise RuntimeError("input-dim={0} is not a multiple of dim={1}".format( + input_dim, dim)) + + def auxiliary_outputs(self): + return [] + + def output_name(self, auxiliary_output = None): + assert auxiliary_output is None + return self.name + + def output_dim(self, auxiliary_output = None): + assert auxiliary_output is None + return self.config['dim'] + + + def get_full_config(self): + ans = [] + config_lines = self.generate_channel_average_config() + for line in config_lines: + for config_name in ['ref', 'final']: + ans.append((config_name, line)) + return ans + + def generate_channel_average_config(self): + configs = [] + name = self.name + input_dim = self.descriptors['input']['dim'] + input_descriptor = self.descriptors['input']['final-string'] + dim = self.config['dim'] + # choose the scale that makes it an average rather than a sum. + scale = dim * 1.0 / input_dim + configs.append('component name={0} type=SumBlockComponent input-dim={1} ' + 'output-dim={2} scale={3}'.format(name, input_dim, + dim, scale)) + configs.append('component-node name={0} component={0} input={1}'.format( + name, input_descriptor)) + return configs diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py index 0ab4a5e5f63..a7d5ece6ce9 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py @@ -21,6 +21,8 @@ 'output-layer' : xlayers.XconfigOutputLayer, 'relu-layer' : xlayers.XconfigBasicLayer, 'relu-renorm-layer' : xlayers.XconfigBasicLayer, + 'relu-batchnorm-dropout-layer' : xlayers.XconfigBasicLayer, + 'relu-dropout-layer': xlayers.XconfigBasicLayer, 'relu-batchnorm-layer' : xlayers.XconfigBasicLayer, 'sigmoid-layer' : xlayers.XconfigBasicLayer, 'tanh-layer' : xlayers.XconfigBasicLayer, @@ -32,15 +34,17 @@ 'fast-lstm-layer' : xlayers.XconfigFastLstmLayer, 'fast-lstmp-layer' : xlayers.XconfigFastLstmpLayer, 'relu-conv-layer': xlayers.XconfigConvLayer, + 'conv-layer': xlayers.XconfigConvLayer, 'conv-relu-layer': xlayers.XconfigConvLayer, 'relu-conv-renorm-layer': xlayers.XconfigConvLayer, 'conv-relu-renorm-layer': xlayers.XconfigConvLayer, - 'relu-conv-batchnorm-layer': xlayers.XconfigConvLayer, + 'batchnorm-conv-relu-layer': xlayers.XconfigConvLayer, + 'relu-batchnorm-conv-layer': xlayers.XconfigConvLayer, 'conv-relu-batchnorm-layer': xlayers.XconfigConvLayer, 'conv-relu-batchnorm-dropout-layer': xlayers.XconfigConvLayer, 'conv-relu-dropout-layer': xlayers.XconfigConvLayer, - 'relu-dropout-layer': xlayers.XconfigBasicLayer - + 'res-block': xlayers.XconfigResBlock, + 'channel-average-layer': xlayers.ChannelAverageLayer } # Turn a config line and a list of previous layers into diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index 6bc51dcbd3f..59185235ba1 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -435,7 +435,7 @@ def learning_rate(iter, current_num_jobs, 
num_archives_processed): shrinkage_value = 1.0 if args.shrink_value != 1.0: shrinkage_value = (args.shrink_value - if common_train_lib.do_shrinkage( + if common_train_lib.should_do_shrinkage( iter, model_file, args.shrink_saturation_threshold) else 1 diff --git a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py index 6c7123f7fa6..0fd0cc04d48 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py @@ -80,6 +80,18 @@ def get_args(): rule as accepted by the --minibatch-size option of nnet3-merge-egs; run that program without args to see the format.""") + parser.add_argument("--trainer.optimization.proportional-shrink", type=float, + dest='proportional_shrink', default=0.0, + help="""If nonzero, this will set a shrinkage (scaling) + factor for the parameters, whose value is set as: + shrink-value=(1.0 - proportional-shrink * learning-rate), where + 'learning-rate' is the learning rate being applied + on the current iteration, which will vary from + initial-effective-lrate*num-jobs-initial to + final-effective-lrate*num-jobs-final. + Unlike for train_rnn.py, this is applied unconditionally, + it does not depend on saturation of nonlinearities. + Can be used to roughly approximate l2 regularization.""") # General options parser.add_argument("--nj", type=int, default=4, @@ -320,6 +332,17 @@ def learning_rate(iter, current_num_jobs, num_archives_processed): + (args.num_jobs_final - args.num_jobs_initial) * float(iter) / num_iters) + lrate = learning_rate(iter, current_num_jobs, + num_archives_processed) + shrink_value = 1.0 + if args.proportional_shrink != 0.0: + shrink_value = 1.0 - (args.proportional_shrink * lrate) + if shrink_value <= 0.5: + raise Exception("proportional-shrink={0} is too large, it gives " + "shrink-value={1}".format(args.proportional_shrink, + shrink_value)) + + if args.stage <= iter: train_lib.common.train_one_iteration( dir=args.dir, @@ -329,8 +352,7 @@ def learning_rate(iter, current_num_jobs, num_archives_processed): num_jobs=current_num_jobs, num_archives_processed=num_archives_processed, num_archives=num_archives, - learning_rate=learning_rate(iter, current_num_jobs, - num_archives_processed), + learning_rate=lrate, dropout_edit_string=common_train_lib.get_dropout_edit_string( args.dropout_schedule, float(num_archives_processed) / num_archives_to_process, @@ -339,6 +361,7 @@ def learning_rate(iter, current_num_jobs, num_archives_processed): frames_per_eg=args.frames_per_eg, momentum=args.momentum, max_param_change=args.max_param_change, + shrinkage_value=shrink_value, shuffle_buffer_size=args.shuffle_buffer_size, run_opts=run_opts, get_raw_nnet_from_am=False, diff --git a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py index 60d1c7fd5fe..812be8b95f3 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py @@ -386,7 +386,7 @@ def learning_rate(iter, current_num_jobs, num_archives_processed): shrinkage_value = 1.0 if args.shrink_value != 1.0: shrinkage_value = (args.shrink_value - if common_train_lib.do_shrinkage( + if common_train_lib.should_do_shrinkage( iter, model_file, args.shrink_saturation_threshold, get_raw_nnet_from_am=False) diff --git a/egs/wsj/s5/steps/nnet3/train_rnn.py b/egs/wsj/s5/steps/nnet3/train_rnn.py index e8c044d679a..8405244a7ae 100755 --- a/egs/wsj/s5/steps/nnet3/train_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_rnn.py @@ -382,7 +382,7 @@ def learning_rate(iter, current_num_jobs, 
num_archives_processed): shrinkage_value = 1.0 if args.shrink_value != 1.0: shrinkage_value = (args.shrink_value - if common_train_lib.do_shrinkage( + if common_train_lib.should_do_shrinkage( iter, model_file, args.shrink_saturation_threshold) else 1 diff --git a/src/cudamatrix/cu-kernels-ansi.h b/src/cudamatrix/cu-kernels-ansi.h index 5b72a62e716..3b02b266a01 100644 --- a/src/cudamatrix/cu-kernels-ansi.h +++ b/src/cudamatrix/cu-kernels-ansi.h @@ -97,6 +97,10 @@ void cudaD_add_mat_blocks(dim3 Gr, dim3 Bl, double alpha, const double *src, void cudaF_add_mat_blocks(dim3 Gr, dim3 Bl, float alpha, const float *src, int32_cuda num_row_blocks, int32_cuda num_col_blocks, float *dst, MatrixDim d, int src_stride, int A_trans); +void cudaD_add_mat_repeated(dim3 Gr, dim3 Bl, double alpha, const double *src, + MatrixDim src_dim, double *dst, MatrixDim dst_dim); +void cudaF_add_mat_repeated(dim3 Gr, dim3 Bl, float alpha, const float *src, + MatrixDim src_dim, float *dst, MatrixDim dst_dim); void cudaD_add_mat_diag_vec(dim3 Gr, dim3 Bl, double alpha, double *mat, MatrixDim mat_dim, const double *mat2, int mat2_row_stride, int mat2_col_stride, diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index 6df0e5af9db..b1a9bb1819a 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -578,6 +578,22 @@ static void _add_mat_blocks(Real alpha, const Real* src, } } +template +__global__ +static void _add_mat_repeated(Real alpha, const Real* src, + MatrixDim src_dim, Real* dst, + MatrixDim dst_dim) { + int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; + int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y; + int32_cuda src_i = i % src_dim.cols, + src_j = j % src_dim.rows, + dst_index = i + j * dst_dim.stride, + src_index = src_i + src_j * src_dim.stride; + if (i < dst_dim.cols && j < dst_dim.rows) + dst[dst_index] += alpha * src[src_index]; +} + + template __global__ static void _add_mat_blocks_trans(Real alpha, const Real* src, @@ -3558,6 +3574,12 @@ void cudaF_add_mat_blocks(dim3 Gr, dim3 Bl, float alpha, const float* src, } } +void cudaF_add_mat_repeated(dim3 Gr, dim3 Bl, float alpha, const float* src, + MatrixDim src_dim, float *dst, MatrixDim dst_dim) { + _add_mat_repeated<<>>(alpha, src, src_dim, dst, dst_dim); +} + + void cudaF_set_mat_mat_div_mat(dim3 Gr, dim3 Bl, const float *A, const float *B, const float *C, float *dst, MatrixDim d, int stride_a, int stride_b, int stride_c) { @@ -4217,6 +4239,11 @@ void cudaD_add_mat_blocks(dim3 Gr, dim3 Bl, double alpha, const double* src, } } +void cudaD_add_mat_repeated(dim3 Gr, dim3 Bl, double alpha, const double* src, + MatrixDim src_dim, double *dst, MatrixDim dst_dim) { + _add_mat_repeated<<>>(alpha, src, src_dim, dst, dst_dim); +} + void cudaD_set_mat_mat_div_mat(dim3 Gr, dim3 Bl, const double *A, const double *B, const double *C, double *dst, MatrixDim d, int stride_a, int stride_b, diff --git a/src/cudamatrix/cu-kernels.h b/src/cudamatrix/cu-kernels.h index d2a79f471c8..a2c4aaceb3d 100644 --- a/src/cudamatrix/cu-kernels.h +++ b/src/cudamatrix/cu-kernels.h @@ -158,6 +158,16 @@ inline void cuda_add_mat_blocks(dim3 Gr, dim3 Bl, float alpha, const float *src, cudaF_add_mat_blocks(Gr, Bl, alpha, src, num_row_blocks, num_col_blocks, dst, d, src_stride, A_trans); } +inline void cuda_add_mat_repeated(dim3 Gr, dim3 Bl, double alpha, + const double *src, MatrixDim src_dim, + double *dst, MatrixDim dst_dim) { + cudaD_add_mat_repeated(Gr, Bl, alpha, src, src_dim, dst, dst_dim); +} +inline void cuda_add_mat_repeated(dim3 
Gr, dim3 Bl, float alpha, + const float *src, MatrixDim src_dim, + float *dst, MatrixDim dst_dim) { + cudaF_add_mat_repeated(Gr, Bl, alpha, src, src_dim, dst, dst_dim); +} inline void cuda_add_mat_diag_vec(dim3 Gr, dim3 Bl, double alpha, double *mat, MatrixDim mat_dim, const double *mat2, int mat2_row_stride, int mat2_col_stride, diff --git a/src/cudamatrix/cu-matrix-test.cc b/src/cudamatrix/cu-matrix-test.cc index 2157c97156f..5e49b483c61 100644 --- a/src/cudamatrix/cu-matrix-test.cc +++ b/src/cudamatrix/cu-matrix-test.cc @@ -1223,39 +1223,95 @@ static void UnitTestCuMatrixAddMat() { } } -template -static void UnitTestCuMatrixAddMatBlocks() { - int32 num_row_blocks = 10, num_col_blocks = 20; - Matrix Ha1(100, 100), Ha2(100, 100); - Matrix Hb(100 * num_row_blocks, 100 * num_col_blocks); - Ha1.SetRandn(); - Ha2.SetRandn(); - Hb.SetRandn(); - CuMatrix Da1(100, 100), Da2(100, 100); - CuMatrix Db(100 * num_row_blocks, 100 * num_col_blocks); - Da1.CopyFromMat(Ha1); - Da2.CopyFromMat(Ha2); - Db.CopyFromMat(Hb); +// this tests the branch of AddMatBlocks() that is taken when +// 'this' has a smaller dimension than 'src' (it sums). +template +static void UnitTestCuMatrixAddMatBlocks1() { + for (int32 l = 0; l < 5; l++) { + int32 num_row_blocks = RandInt(1, 10), num_col_blocks = RandInt(1, 20); + int32 block_rows = RandInt(1, 100), block_cols = RandInt(1, 100); + BaseFloat alpha = RandInt(3, 10); + CuMatrix dst(block_rows, block_cols); + dst.SetRandn(); + CuMatrix src(num_row_blocks * block_rows, + num_col_blocks * block_cols); + src.SetRandn(); - for (int32 i = 0; i < num_row_blocks; i++) { - for (int32 j = 0; j < num_col_blocks; j++) { - SubMatrix Hs(Hb.Range(i * 100, 100, j * 100, 100)); - Ha1.AddMat(0.5, Hs, kNoTrans); - Ha2.AddMat(0.5, Hs, kTrans); + CuMatrix dst_copy(dst); + for (int32 rb = 0; rb < num_row_blocks; rb++) { + for (int32 cb = 0; cb < num_col_blocks; cb++) { + CuSubMatrix src_part(src, + rb * block_rows, block_rows, + cb * block_cols, block_cols); + dst_copy.AddMat(alpha, src_part); + } } + dst.AddMatBlocks(alpha, src); + AssertEqual(dst, dst_copy); } +} - Da1.AddMatBlocks(0.5, Db, kNoTrans); - Da2.AddMatBlocks(0.5, Db, kTrans); - Matrix Ha11(100, 100); - Da1.CopyToMat(&Ha11); - AssertEqual(Ha1,Ha11); - Matrix Ha22(100, 100); - Da2.CopyToMat(&Ha22); - AssertEqual(Ha2,Ha22); +// this is as UnitTestCuMatrixAddMatBlocks1, but tests with transpose. +template +static void UnitTestCuMatrixAddMatBlocks1Trans() { + for (int32 l = 0; l < 5; l++) { + int32 num_row_blocks = RandInt(1, 10), num_col_blocks = RandInt(1, 20); + int32 block_rows = RandInt(1, 100), block_cols = RandInt(1, 100); + BaseFloat alpha = RandInt(3, 10); + CuMatrix dst(block_cols, block_rows); + dst.SetRandn(); + CuMatrix src(num_row_blocks * block_rows, + num_col_blocks * block_cols); + src.SetRandn(); + + CuMatrix dst_copy(dst); + for (int32 rb = 0; rb < num_row_blocks; rb++) { + for (int32 cb = 0; cb < num_col_blocks; cb++) { + CuSubMatrix src_part(src, + rb * block_rows, block_rows, + cb * block_cols, block_cols); + dst_copy.AddMat(alpha, src_part, kTrans); + } + } + dst.AddMatBlocks(alpha, src, kTrans); + AssertEqual(dst, dst_copy); + } } + +// this tests the branch of AddMatBlocks() that is taken when +// 'this' has a larger dimension than 'src'. In this case, it does +// a broadcasting rather than a summing operation. 
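A hedged numpy picture of the two AddMatBlocks() branches these tests exercise (dst += alpha * src with mismatched sizes; the kTrans case and the exact divisibility checks are omitted):

import numpy as np

def add_mat_blocks(dst, alpha, src):
    R, C = dst.shape
    r, c = src.shape
    if r >= R and c >= C:
        # summing branch: src is an (r/R) x (c/C) grid of dst-sized blocks
        dst += alpha * src.reshape(r // R, R, c // C, C).sum(axis=(0, 2))
    else:
        # broadcasting branch: dst is a grid of src-sized blocks, add src to each
        dst += alpha * np.tile(src, (R // r, C // c))
    return dst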
+template +static void UnitTestCuMatrixAddMatBlocks2() { + for (int32 l = 0; l < 5; l++) { + int32 num_row_blocks = RandInt(1, 10), num_col_blocks = RandInt(1, 20); + int32 block_rows = RandInt(1, 100), block_cols = RandInt(1, 100); + BaseFloat alpha = RandInt(3, 10); + CuMatrix src(block_rows, block_cols); + src.SetRandn(); + CuMatrix dst(num_row_blocks * block_rows, + num_col_blocks * block_cols); + src.SetRandn(); + + CuMatrix dst_copy(dst); + for (int32 rb = 0; rb < num_row_blocks; rb++) { + for (int32 cb = 0; cb < num_col_blocks; cb++) { + CuSubMatrix dst_copy_part(dst_copy, + rb * block_rows, block_rows, + cb * block_cols, block_cols); + dst_copy_part.AddMat(alpha, src); + } + } + dst.AddMatBlocks(alpha, src); + AssertEqual(dst, dst_copy); + } +} + + + + template static void UnitTestCuMatrixReduceSum() { int32 M = 100 + Rand() % 300, N = 100 + Rand() % 300; @@ -2646,7 +2702,9 @@ template void CudaMatrixUnitTest() { UnitTestCuMatrixMulRowsVec(); UnitTestCuMatrixDivRowsVec(); UnitTestCuMatrixAddMat(); - UnitTestCuMatrixAddMatBlocks(); + UnitTestCuMatrixAddMatBlocks1(); + UnitTestCuMatrixAddMatBlocks1Trans(); + UnitTestCuMatrixAddMatBlocks2(); UnitTestCuMatrixReduceSum(); UnitTestCuMatrixReduceMax(); UnitTestCuMatrixReduceMin(); diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index cfa570233c3..91e140e6bcd 100644 --- a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -912,6 +912,7 @@ void CuMatrixBase::DivRowsVec(const CuVectorBase &div) { } } + template void CuMatrixBase::InvertElements() { #if HAVE_CUDA == 1 @@ -969,43 +970,81 @@ template void CuMatrixBase::AddMatBlocks(Real alpha, const CuMatrixBase &A, MatrixTransposeType transA) { if (num_rows_ == 0 || num_cols_ == 0) return; - int32 num_row_blocks, num_col_blocks; - if (transA == kNoTrans) { - KALDI_ASSERT(A.NumRows() % num_rows_ == 0 && A.NumCols() % num_cols_ == 0); - num_row_blocks = A.Mat().NumRows() / num_rows_; - num_col_blocks = A.Mat().NumCols() / num_cols_; - } else { - KALDI_ASSERT(A.NumRows() % num_cols_ == 0 && A.NumCols() % num_rows_ == 0); - num_row_blocks = A.Mat().NumRows() / num_cols_; - num_col_blocks = A.Mat().NumCols() / num_rows_; - } -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { - Timer tim; - dim3 dimGrid, dimBlock; - GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), - &dimGrid, &dimBlock); - cuda_add_mat_blocks(dimGrid, dimBlock, alpha, A.data_, num_row_blocks, - num_col_blocks, data_, Dim(), A.Stride(), - (transA == kTrans ? 1 : 0)); - CU_SAFE_CALL(cudaGetLastError()); - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); - } else -#endif - { - int32 nr, nc; + if (A.NumRows() >= num_rows_ && A.NumCols() >= num_cols_) { + // This is the "summing", not broadcasting, version of AddMatBlocks. + // It supports both regular and transposed operation. 
+ int32 num_row_blocks, num_col_blocks; if (transA == kNoTrans) { - nr = num_rows_; - nc = num_cols_; + KALDI_ASSERT(A.NumRows() % num_rows_ == 0 && A.NumCols() % num_cols_ == 0); + num_row_blocks = A.Mat().NumRows() / num_rows_; + num_col_blocks = A.Mat().NumCols() / num_cols_; } else { - nr = num_cols_; - nc = num_rows_; + KALDI_ASSERT(A.NumRows() % num_cols_ == 0 && A.NumCols() % num_rows_ == 0); + num_row_blocks = A.Mat().NumRows() / num_cols_; + num_col_blocks = A.Mat().NumCols() / num_rows_; + } +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { + Timer tim; + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); + cuda_add_mat_blocks(dimGrid, dimBlock, alpha, A.data_, num_row_blocks, + num_col_blocks, data_, Dim(), A.Stride(), + (transA == kTrans ? 1 : 0)); + CU_SAFE_CALL(cudaGetLastError()); + + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); + } else +#endif + { + int32 nr, nc; + if (transA == kNoTrans) { + nr = num_rows_; + nc = num_cols_; + } else { + nr = num_cols_; + nc = num_rows_; + } + for (int32 i = 0; i < num_row_blocks; i++) { + for (int32 j = 0; j < num_col_blocks; j++) { + Mat().AddMat(alpha, SubMatrix(A.Mat(), i * nr, nr, j * nc, nc), + transA); + } + } } - for (int32 i = 0; i < num_row_blocks; i++) { - for (int32 j = 0; j < num_col_blocks; j++) { - Mat().AddMat(alpha, SubMatrix(A.Mat(), i * nr, nr, j * nc, nc), - transA); + } else { + // This is the "broadcasting" version of AddMatBlocks, where + // *this is larger than src. + if (!(num_rows_ % A.NumRows() == 0 && num_cols_ % A.NumCols() == 0)) + KALDI_ERR << "Invalid sizes of arguments"; + if (transA != kNoTrans) + KALDI_ERR << "Transposed operation not supported currently."; +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { + Timer tim; + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); + cuda_add_mat_repeated(dimGrid, dimBlock, alpha, + A.data_, A.Dim(), data_, Dim()); + CU_SAFE_CALL(cudaGetLastError()); + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); + } else +#endif + { + const MatrixBase &src_mat = A.Mat(), + &this_mat = this->Mat(); + for (int32 row_offset = 0; row_offset < NumRows(); + row_offset += src_mat.NumRows()) { + for (int32 col_offset = 0; col_offset < NumCols(); + col_offset += src_mat.NumCols()) { + SubMatrix this_part(this_mat, + row_offset, src_mat.NumRows(), + col_offset, src_mat.NumCols()); + this_part.AddMat(alpha, src_mat); + } } } } diff --git a/src/cudamatrix/cu-matrix.h b/src/cudamatrix/cu-matrix.h index 0a4c4b0669e..e8823793cc3 100644 --- a/src/cudamatrix/cu-matrix.h +++ b/src/cudamatrix/cu-matrix.h @@ -421,9 +421,25 @@ class CuMatrixBase { void AddMat(Real alpha, const CuMatrixBase &A, MatrixTransposeType trans = kNoTrans); - /// if A.NumRows() is multiple of (*this)->NumRows and A.NumCols() is multiple of (*this)->NumCols - /// divide A into blocks of the same size as (*this) and add them to *this (times alpha) - void AddMatBlocks(Real alpha, const CuMatrixBase &A, MatrixTransposeType trans = kNoTrans); + + /// This function is like AddMat (it does *this += alpha * src), + /// except that it supports cases where *this and src have + /// different dimension. There are two allowed cases: + /// + /// (1) *this is larger than src; we do a broadcasting operation. *this must + /// have NumRows() == a * src.NumRows() and NumCols() == b * + /// src.NumCols() for integer a >= 1, b >= 1. 
*this will be treated as + /// a being made up of of blocks with the same size as src, and to each + /// block we'll add alpha * src. This case does not support trans == + /// kTrans. + /// + /// (2) *this is smaller than src; we sum. src.NumRows() must == a * + /// this->NumRows(), and src.NumCols() must == b * this->NumCols(), for a + /// >= 1, b >= 1. In this case, src will be treated as being made up of + /// blocks with the same size as *this, and to *this we will add the + /// summation of all of those blocks. + void AddMatBlocks(Real alpha, const CuMatrixBase &A, + MatrixTransposeType trans = kNoTrans); /// (for each column c of *this), c = alpha * col + beta * c void AddVecToCols(Real alpha, const CuVectorBase &col, Real beta = 1.0); diff --git a/src/featbin/copy-feats.cc b/src/featbin/copy-feats.cc index 8f94f27d4dd..29fbf2c3be0 100644 --- a/src/featbin/copy-feats.cc +++ b/src/featbin/copy-feats.cc @@ -52,7 +52,7 @@ int main(int argc, char *argv[]) { "(only currently supported for wxfilename, i.e. archive/script," "output)"); po.Register("compression-method", &compression_method_in, - "Only relevant if --compress=true; the method (1 through 6) to " + "Only relevant if --compress=true; the method (1 through 7) to " "compress the matrix. Search for CompressionMethod in " "src/matrix/compressed-matrix.h."); po.Register("write-num-frames", &num_frames_wspecifier, diff --git a/src/nnet3/nnet-combine.cc b/src/nnet3/nnet-combine.cc index 89869c39dbd..5c0187a267a 100644 --- a/src/nnet3/nnet-combine.cc +++ b/src/nnet3/nnet-combine.cc @@ -222,7 +222,7 @@ void NnetCombiner::PrintParams(const VectorBase ¶ms) const { int32 num_effective_nnets = nnet_params_.NumRows(); if (num_effective_nnets != num_real_input_nnets_) KALDI_LOG << "Above, only " << num_effective_nnets << " weights were " - "printed due to the the --num-effective-nnets option; " + "printed due to the the --max-effective-inputs option; " "there were " << num_real_input_nnets_ << " actual input nnets. 
" "Each weight corresponds to a weighted average over a range of " "nnets in the sequence (with triangular bins)"; diff --git a/src/nnet3/nnet-component-itf.cc b/src/nnet3/nnet-component-itf.cc index 8a8a10d9475..19b86bbd482 100644 --- a/src/nnet3/nnet-component-itf.cc +++ b/src/nnet3/nnet-component-itf.cc @@ -99,8 +99,6 @@ Component* Component::NewComponentOfType(const std::string &component_type) { ans = new NormalizeComponent(); } else if (component_type == "PnormComponent") { ans = new PnormComponent(); - } else if (component_type == "SumReduceComponent") { - ans = new SumReduceComponent(); } else if (component_type == "AffineComponent") { ans = new AffineComponent(); } else if (component_type == "NaturalGradientAffineComponent") { @@ -161,6 +159,8 @@ Component* Component::NewComponentOfType(const std::string &component_type) { ans = new BatchNormComponent(); } else if (component_type == "TimeHeightConvolutionComponent") { ans = new TimeHeightConvolutionComponent(); + } else if (component_type == "SumBlockComponent") { + ans = new SumBlockComponent(); } if (ans != NULL) { KALDI_ASSERT(component_type == ans->Type()); diff --git a/src/nnet3/nnet-convolutional-component.cc b/src/nnet3/nnet-convolutional-component.cc index 1b07befdf95..d8ed2380143 100644 --- a/src/nnet3/nnet-convolutional-component.cc +++ b/src/nnet3/nnet-convolutional-component.cc @@ -87,6 +87,32 @@ std::string TimeHeightConvolutionComponent::Info() const { } +void TimeHeightConvolutionComponent::InitUnit() { + if (model_.num_filters_in != model_.num_filters_out) { + KALDI_ERR << "You cannot specify init-unit if the num-filters-in " + << "and num-filters-out differ."; + } + size_t i; + int32 zero_offset = 0; + for (i = 0; i < model_.offsets.size(); i++) { + if (model_.offsets[i].time_offset == 0 && + model_.offsets[i].height_offset == 0) { + zero_offset = i; + break; + } + } + if (i == model_.offsets.size()) // did not break. + KALDI_ERR << "You cannot specify init-unit if the model does " + << "not have the offset (0, 0)."; + + CuSubMatrix zero_offset_block( + linear_params_, 0, linear_params_.NumRows(), + zero_offset * model_.num_filters_in, model_.num_filters_in); + + KALDI_ASSERT(zero_offset_block.NumRows() == zero_offset_block.NumCols()); + zero_offset_block.AddToDiag(1.0); // set this block to the unit matrix. +} + void TimeHeightConvolutionComponent::InitFromConfig(ConfigLine *cfl) { // 1. Config values inherited from UpdatableComponent. InitLearningRatesFromConfig(cfl); @@ -169,16 +195,22 @@ void TimeHeightConvolutionComponent::InitFromConfig(ConfigLine *cfl) { // 3. Parameter-initialization configs. BaseFloat param_stddev = -1, bias_stddev = 0.0; + bool init_unit = false; cfl->GetValue("param-stddev", ¶m_stddev); cfl->GetValue("bias-stddev", &bias_stddev); + cfl->GetValue("init-unit", &init_unit); if (param_stddev < 0.0) { param_stddev = 1.0 / sqrt(model_.num_filters_in * model_.offsets.size()); } // initialize the parameters. 
linear_params_.Resize(model_.ParamRows(), model_.ParamCols()); - linear_params_.SetRandn(); - linear_params_.Scale(param_stddev); + if (!init_unit) { + linear_params_.SetRandn(); + linear_params_.Scale(param_stddev); + } else { + InitUnit(); + } bias_params_.Resize(model_.num_filters_out); bias_params_.SetRandn(); bias_params_.Scale(bias_stddev); diff --git a/src/nnet3/nnet-convolutional-component.h b/src/nnet3/nnet-convolutional-component.h index a48987213af..59442504444 100644 --- a/src/nnet3/nnet-convolutional-component.h +++ b/src/nnet3/nnet-convolutional-component.h @@ -120,6 +120,12 @@ namespace nnet3 { filters; this value will ensure that the output has unit stddev if the input has unit stddev. bias-stddev Standard deviation of bias terms. default=0.0. + init-unit Defaults to false. If true, it is required that + num-filters-in equal num-filters-out and there should + exist a (height, time) offset in the model equal to (0, + 0). We will initialize the parameter matrix to be + equivalent to the identity transform. In this case, + param-stddev is ignored. Natural-gradient related options are below; you won't normally have to @@ -308,7 +314,9 @@ class TimeHeightConvolutionComponent: public UpdatableComponent { const CuMatrixBase &in_value, const CuMatrixBase &out_deriv); - + // Function called to initialize linear_params_ if init-unit=true in the config + // line. + void InitUnit(); time_height_convolution::ConvolutionModel model_; diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc index 27482678235..da19b477337 100644 --- a/src/nnet3/nnet-simple-component.cc +++ b/src/nnet3/nnet-simple-component.cc @@ -217,84 +217,10 @@ void DropoutComponent::Write(std::ostream &os, bool binary) const { WriteToken(os, binary, ""); WriteBasicType(os, binary, dropout_per_frame_); WriteToken(os, binary, ""); - WriteBasicType(os, binary, test_mode_); + WriteBasicType(os, binary, test_mode_); WriteToken(os, binary, ""); } -void SumReduceComponent::Init(int32 input_dim, int32 output_dim) { - input_dim_ = input_dim; - output_dim_ = output_dim; - KALDI_ASSERT(input_dim_ > 0 && output_dim_ > 0 && - input_dim_ % output_dim_ == 0); -} - -void SumReduceComponent::InitFromConfig(ConfigLine *cfl) { - int32 input_dim = 0; - int32 output_dim = 0; - bool ok = cfl->GetValue("output-dim", &output_dim) && - cfl->GetValue("input-dim", &input_dim); - if (!ok || cfl->HasUnusedValues() || output_dim <= 0) - KALDI_ERR << "Invalid initializer for layer of type " - << Type() << ": \"" << cfl->WholeLine() << "\""; - Init(input_dim, output_dim); -} - - -void* SumReduceComponent::Propagate(const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in, - CuMatrixBase *out) const { - KALDI_ASSERT(out->NumRows() == in.NumRows() && in.NumCols() == input_dim_ - && out->NumCols() == output_dim_); - int32 num_blocks = input_dim_ / output_dim_; - for (int32 i = 0; i < num_blocks; i++) { - CuSubMatrix in_block(in, 0, in.NumRows(), - i * output_dim_, output_dim_); - if (i == 0) - out->CopyFromMat(in_block); - else - out->AddMat(1.0, in_block); - } - return NULL; -} - -void SumReduceComponent::Backprop(const std::string &debug_info, - const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &, // in_value - const CuMatrixBase &, // out_value - const CuMatrixBase &out_deriv, - void *memo, - Component *, // to_update - CuMatrixBase *in_deriv) const { - if (!in_deriv) return; - KALDI_ASSERT(out_deriv.NumRows() == in_deriv->NumRows() && - in_deriv->NumCols() == input_dim_ && - out_deriv.NumCols() 
== output_dim_); - int32 num_blocks = input_dim_ / output_dim_; - for (int32 i = 0; i < num_blocks; i++) { - CuSubMatrix in_deriv_block(*in_deriv, 0, in_deriv->NumRows(), - i * output_dim_, output_dim_); - in_deriv_block.CopyFromMat(out_deriv); - } -} - -void SumReduceComponent::Read(std::istream &is, bool binary) { - ExpectOneOrTwoTokens(is, binary, "", ""); - ReadBasicType(is, binary, &input_dim_); - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &output_dim_); - ExpectToken(is, binary, ""); -} - -void SumReduceComponent::Write(std::ostream &os, bool binary) const { - WriteToken(os, binary, ""); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, input_dim_); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, output_dim_); - WriteToken(os, binary, ""); -} - - void ElementwiseProductComponent::Init(int32 input_dim, int32 output_dim) { input_dim_ = input_dim; output_dim_ = output_dim; @@ -5873,5 +5799,78 @@ void BatchNormComponent::ZeroStats() { } +SumBlockComponent::SumBlockComponent(const SumBlockComponent &other): + input_dim_(other.input_dim_), output_dim_(other.output_dim_), + scale_(other.scale_) { } + +void SumBlockComponent::InitFromConfig(ConfigLine *cfl) { + scale_ = 1.0; + bool ok = cfl->GetValue("input-dim", &input_dim_) && + cfl->GetValue("output-dim", &output_dim_); + if (!ok) + KALDI_ERR << "input-dim and output-dim must both be provided."; + if (input_dim_ <= 0 || input_dim_ % output_dim_ != 0) + KALDI_ERR << "Invalid values input-dim=" << input_dim_ + << " output-dim=" << output_dim_; + cfl->GetValue("scale", &scale_); + if (cfl->HasUnusedValues()) + KALDI_ERR << "Could not process these elements in initializer: " + << cfl->UnusedValues(); +} + +void SumBlockComponent::Read(std::istream &is, bool binary) { + ExpectOneOrTwoTokens(is, binary, "", ""); + ReadBasicType(is, binary, &input_dim_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &output_dim_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &scale_); + ExpectToken(is, binary, ""); +} + +void SumBlockComponent::Write(std::ostream &os, bool binary) const { + WriteToken(os, binary, ""); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, input_dim_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, output_dim_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, scale_); + WriteToken(os, binary, ""); +} + +std::string SumBlockComponent::Info() const { + std::ostringstream stream; + stream << Type() << ", input-dim=" << input_dim_ + << ", output-dim=" << output_dim_ + << ", scale=" << scale_; + return stream.str(); +} + +void* SumBlockComponent::Propagate(const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const { + KALDI_ASSERT(out->NumRows() == in.NumRows() && + out->NumCols() == output_dim_ && + in.NumCols() == input_dim_); + out->AddMatBlocks(scale_, in, kNoTrans); + return NULL; +} + +void SumBlockComponent::Backprop( + const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &, //in_value + const CuMatrixBase &, // out_value, + const CuMatrixBase &out_deriv, + void *memo, + Component *to_update, + CuMatrixBase *in_deriv) const { + if (in_deriv) { + in_deriv->AddMatBlocks(scale_, out_deriv, kNoTrans); + } +} + + } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-simple-component.h b/src/nnet3/nnet-simple-component.h index a640470098e..4af8649515f 100644 --- a/src/nnet3/nnet-simple-component.h +++ b/src/nnet3/nnet-simple-component.h @@ -424,54 
+424,6 @@ class RectifiedLinearComponent: public NonlinearComponent { RectifiedLinearComponent &operator = (const RectifiedLinearComponent &other); // Disallow. }; -/** - This component is a fixed (non-trainable) nonlinearity that sums its inputs - to produce outputs. Currently the only supported configuration is that its - input-dim is interpreted as consisting of n blocks, and the output is just a - summation over the n blocks, where n = input-dim / output-dim, so for instance - output[n] = input[n] + input[block-size + n] + .... . - Later if needed we can add a configuration variable that allows you to sum - over 'interleaved' input. - */ -class SumReduceComponent: public Component { - public: - void Init(int32 input_dim, int32 output_dim); - explicit SumReduceComponent(int32 input_dim, int32 output_dim) { - Init(input_dim, output_dim); - } - virtual int32 Properties() const { - return kSimpleComponent|kLinearInInput; - } - SumReduceComponent(): input_dim_(0), output_dim_(0) { } - virtual std::string Type() const { return "SumReduceComponent"; } - virtual void InitFromConfig(ConfigLine *cfl); - virtual int32 InputDim() const { return input_dim_; } - virtual int32 OutputDim() const { return output_dim_; } - virtual void* Propagate(const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in, - CuMatrixBase *out) const; - virtual void Backprop(const std::string &debug_info, - const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &, // in_value - const CuMatrixBase &, // out_value, - const CuMatrixBase &out_deriv, - void *memo, - Component *, // to_update - CuMatrixBase *in_deriv) const; - virtual Component* Copy() const { return new SumReduceComponent(input_dim_, - output_dim_); } - - virtual void Read(std::istream &is, bool binary); // This Read function - // requires that the Component has the correct type. - - /// Write component to stream - virtual void Write(std::ostream &os, bool binary) const; - - protected: - int32 input_dim_; - int32 output_dim_; -}; - class FixedAffineComponent; class FixedScaleComponent; @@ -1119,8 +1071,10 @@ class FixedBiasComponent: public Component { KALDI_DISALLOW_COPY_AND_ASSIGN(FixedBiasComponent); }; -// NoOpComponent just duplicates its input. We don't anticipate this being used -// very often, but it may sometimes make your life easier +/** NoOpComponent just duplicates its input. We don't anticipate this being used + very often, but it may sometimes make your life easier. + The only config parameter it accepts is 'dim', e.g. 'dim=400'. +*/ class NoOpComponent: public NonlinearComponent { public: explicit NoOpComponent(const NoOpComponent &other): NonlinearComponent(other) { } @@ -1145,6 +1099,54 @@ class NoOpComponent: public NonlinearComponent { NoOpComponent &operator = (const NoOpComponent &other); // Disallow. }; +/** SumBlockComponent sums over blocks of its input: for instance, if + you create one with the config "input-dim=400 output-dim=100", + its output will be the sum over the 4 100-dimensional blocks of + the input. + + The "scale" config parameter may be used if you want to do averaging + instead of summing, e.g. "input-dim=400 output-dim=100 scale=0.25" + will accomplish averaging. + + Accepted values on its config-file line are: + input-dim The input dimension. Required. + output-dim The block dimension. Required. Must divide input-dim. + scale A scaling factor on the output. Defaults to 1.0. 
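+
+ As an illustrative example (not taken from this patch), a config-file line
+ creating such a component could look like
+ 'component name=sum-block type=SumBlockComponent input-dim=400 output-dim=100 scale=0.25',
+ which averages the four 100-dimensional blocks of a 400-dimensional input.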
+ */ +class SumBlockComponent: public Component { + public: + explicit SumBlockComponent(const SumBlockComponent &other); + SumBlockComponent() { } + virtual std::string Type() const { return "SumBlockComponent"; } + virtual int32 Properties() const { + return kSimpleComponent|kLinearInInput|kPropagateAdds|kBackpropAdds; + } + virtual void InitFromConfig(ConfigLine *cfl); + virtual int32 InputDim() const { return input_dim_; } + virtual int32 OutputDim() const { return output_dim_; } + virtual void Read(std::istream &is, bool binary); + virtual void Write(std::ostream &os, bool binary) const; + virtual std::string Info() const; + virtual Component* Copy() const { return new SumBlockComponent(*this); } + virtual void* Propagate(const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const; + virtual void Backprop(const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &, //in_value + const CuMatrixBase &, // out_value, + const CuMatrixBase &out_deriv, + void *memo, + Component *to_update, + CuMatrixBase *in_deriv) const; + private: + int32 input_dim_; + int32 output_dim_; + BaseFloat scale_; + SumBlockComponent &operator = (const SumBlockComponent &other); // Disallow. +}; + + // ClipGradientComponent just duplicates its input, but clips gradients // during backpropagation if they cross a predetermined threshold. // This component will be used to prevent gradient explosion problem in @@ -1415,9 +1417,23 @@ class PerElementScaleComponent: public UpdatableComponent { CuVector scales_; }; +/* + PerElementOffsetComponent offsets each dimension of its input with a separate + trainable bias; it's like an affine component with fixed weight matrix which + is always equal to I. + + Accepted values on its config line, with defaults if applicable: + + vector If specified, the offsets will be read from this file ('vector' + is interpreted as an rxfilename). -// PerElementOffsetComponent offsets each dimension of its input with a separate -// trainable bias; it's like an affine component with fixed weight matrix which is always equal to I. + dim If 'vector' is not specified, you should specify the + dimension 'dim', and the offsets will be randomly initialized according + to 'param-mean' and 'param-stddev'. + param-mean=0.0 Mean of randomly initialized offset parameters. + param-stddev=0.0 Standard deviation of randomly initialized offset parameters. 
+ +*/ class PerElementOffsetComponent: public UpdatableComponent { public: virtual int32 InputDim() const { return offsets_.Dim(); } diff --git a/src/nnet3/nnet-test-utils.cc b/src/nnet3/nnet-test-utils.cc index e1d58b34428..a138fcacceb 100644 --- a/src/nnet3/nnet-test-utils.cc +++ b/src/nnet3/nnet-test-utils.cc @@ -1288,7 +1288,7 @@ void ComputeExampleComputationRequestSimple( static void GenerateRandomComponentConfig(std::string *component_type, std::string *config) { - int32 n = RandInt(0, 33); + int32 n = RandInt(0, 32); BaseFloat learning_rate = 0.001 * RandInt(1, 100); std::ostringstream os; @@ -1463,14 +1463,7 @@ static void GenerateRandomComponentConfig(std::string *component_type, << " learning-rate=" << learning_rate << param_config; break; } - case 20: { - *component_type = "SumReduceComponent"; - int32 output_dim = RandInt(1, 50), group_size = RandInt(1, 15), - input_dim = output_dim * group_size; - os << "input-dim=" << input_dim << " output-dim=" << output_dim; - break; - } - case 21: { + case 20: case 21: { *component_type = "CompositeComponent"; int32 cur_dim = RandInt(20, 30), num_components = RandInt(1, 3), max_rows_process = RandInt(1, 30); @@ -1598,7 +1591,7 @@ static void GenerateRandomComponentConfig(std::string *component_type, } // I think we'll get in the habit of allocating a larger number of case // labels to the most recently added component, so it gets tested more - case 31: case 32: case 33: { + case 31: { *component_type = "BatchNormComponent"; int32 block_dim = RandInt(1, 10), dim = block_dim * RandInt(1, 2); bool test_mode = (RandInt(0, 1) == 0); @@ -1608,6 +1601,16 @@ static void GenerateRandomComponentConfig(std::string *component_type, << (test_mode ? "true" : "false"); break; } + case 32: { + *component_type = "SumBlockComponent"; + BaseFloat scale = 0.5 * RandInt(1, 3); + BaseFloat output_dim = RandInt(1, 10), + input_dim = output_dim * RandInt(1, 3); + os << "input-dim=" << input_dim + << " output-dim=" << output_dim + << " scale=" << scale; + break; + } default: KALDI_ERR << "Error generating random component"; }
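For reference (this is not part of the patch): the per-row block-sum that SumBlockComponent::Propagate() delegates to AddMatBlocks() in the "*this is smaller than src" case can be sketched in plain, self-contained C++ as below. It uses std::vector rather than Kaldi's CuMatrix API, and all names in it are illustrative only.

// Illustrates the block-sum semantics used by SumBlockComponent: an input row
// of dimension input_dim is viewed as (input_dim / output_dim) consecutive
// blocks of size output_dim, which are summed into the output and scaled.
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <iostream>
#include <vector>

// Sums the consecutive blocks of 'in' into 'out', scaled by 'scale'.
void SumBlocks(const std::vector<float> &in, float scale,
               std::vector<float> *out) {
  std::size_t input_dim = in.size(), output_dim = out->size();
  assert(output_dim > 0 && input_dim % output_dim == 0);
  std::fill(out->begin(), out->end(), 0.0f);
  for (std::size_t i = 0; i < input_dim; i++)
    (*out)[i % output_dim] += scale * in[i];  // element i lies in block i / output_dim.
}

int main() {
  // input-dim=8, output-dim=4, scale=0.5: averages the two 4-dim blocks.
  std::vector<float> in = {1, 2, 3, 4, 5, 6, 7, 8};
  std::vector<float> out(4);
  SumBlocks(in, 0.5f, &out);
  for (float v : out) std::cout << v << " ";  // prints: 3 4 5 6
  std::cout << std::endl;
  return 0;
}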