diff --git a/egs/cifar/v1/local/nnet3/run_cnn_1d.sh b/egs/cifar/v1/local/nnet3/run_cnn_1d.sh
new file mode 100755
index 00000000000..6baad31fcbb
--- /dev/null
+++ b/egs/cifar/v1/local/nnet3/run_cnn_1d.sh
@@ -0,0 +1,112 @@
+#!/bin/bash
+
+# 1d is as 1c but adding batch-norm to all convolutional layers.
+# batch-norm helps (0.78 -> 0.8).
+
+# exp/cnn1d_cifar10: num-iters=60 nj=1..2 num-params=4.3M dim=96->10 combine=-0.10->-0.08 loglike:train/valid[39,59,final]=(-0.03,-0.00,-0.00/-0.63,-0.69,-0.68) accuracy:train/valid[39,59,final]=(1.00,1.00,1.00/0.81,0.82,0.82)
+
+
+# Set -e here so that we catch if any executable fails immediately
+set -euo pipefail
+
+
+
+# training options
+stage=0
+train_stage=-10
+dataset=cifar10
+srand=0
+reporting_email=
+affix=1d
+
+
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! cuda-compiled; then
+  cat <<EOF > $dir/configs/network.xconfig
+  input dim=96 name=input
+  conv-relu-batchnorm-layer name=cnn1 height-in=32 height-out=32 time-offsets=-1,0,1 $common1
+  conv-relu-batchnorm-dropout-layer name=cnn2 height-in=32 height-out=16 time-offsets=-1,0,1 dropout-proportion=0.25 $common1 height-subsample-out=2
+  conv-relu-batchnorm-layer name=cnn3 height-in=16 height-out=16 time-offsets=-1,0,1 $common2
+  conv-relu-batchnorm-dropout-layer name=cnn4 height-in=16 height-out=8 time-offsets=-1,0,1 dropout-proportion=0.25 $common2 height-subsample-out=2
+  relu-dropout-layer name=fully_connected1 input=Append(0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30) dropout-proportion=0.5 dim=512
+  output-layer name=output dim=$num_targets
+EOF
+  steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/
+fi
+
+
+if [ $stage -le 2 ]; then
+
+  steps/nnet3/train_raw_dnn.py --stage=$train_stage \
+    --cmd="$train_cmd" \
+    --trainer.srand=$srand \
+    --trainer.max-param-change=2.0 \
+    --trainer.num-epochs=30 \
+    --egs.frames-per-eg=1 \
+    --trainer.optimization.num-jobs-initial=1 \
+    --trainer.optimization.num-jobs-final=2 \
+    --trainer.optimization.initial-effective-lrate=0.003 \
+    --trainer.optimization.final-effective-lrate=0.0003 \
+    --trainer.optimization.minibatch-size=256,128,64 \
+    --trainer.shuffle-buffer-size=2000 \
+    --egs.dir="$egs" \
+    --use-gpu=true \
+    --reporting.email="$reporting_email" \
+    --dir=$dir || exit 1;
+fi
+
+
+exit 0;
diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py
index 4341be630aa..0ab4a5e5f63 100644
--- a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py
+++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py
@@ -37,6 +37,7 @@
         'conv-relu-renorm-layer': xlayers.XconfigConvLayer,
         'relu-conv-batchnorm-layer': xlayers.XconfigConvLayer,
         'conv-relu-batchnorm-layer': xlayers.XconfigConvLayer,
+        'conv-relu-batchnorm-dropout-layer': xlayers.XconfigConvLayer,
         'conv-relu-dropout-layer': xlayers.XconfigConvLayer,
         'relu-dropout-layer': xlayers.XconfigBasicLayer
diff --git a/src/chainbin/nnet3-chain-combine.cc b/src/chainbin/nnet3-chain-combine.cc
index 85e328709e0..5e3451b073d 100644
--- a/src/chainbin/nnet3-chain-combine.cc
+++ b/src/chainbin/nnet3-chain-combine.cc
@@ -41,6 +41,8 @@ int main(int argc, char *argv[]) {
         " nnet3-combine den.fst 35.raw 36.raw 37.raw 38.raw ark:valid.cegs final.raw\n";
 
     bool binary_write = true;
+    bool batchnorm_test_mode = false,
+        dropout_test_mode = true;
     std::string use_gpu = "yes";
     NnetCombineConfig combine_config;
    chain::ChainTrainingOptions chain_config;
@@ -49,6 +51,11 @@ int main(int argc, char *argv[]) {
     po.Register("binary", &binary_write, "Write output in binary mode");
     po.Register("use-gpu", &use_gpu,
                 "yes|no|optional|wait, only has effect if compiled with CUDA");
+    po.Register("batchnorm-test-mode", &batchnorm_test_mode,
+                "If true, set test-mode to true on any BatchNormComponents.");
+    po.Register("dropout-test-mode", &dropout_test_mode,
+                "If true, set test-mode to true on any DropoutComponents and "
+                "DropoutMaskComponents.");
 
     combine_config.Register(&po);
     chain_config.Register(&po);
@@ -77,13 +84,10 @@ int main(int argc, char *argv[]) {
     Nnet nnet;
     ReadKaldiObject(raw_nnet_rxfilename, &nnet);
 
-    // This is needed for batch-norm. We also ensure in the calling script
-    // that the freshest model comes first on the command line; this
-    // means we use the freshest batch-norm stats. (Since the batch-norm
-    // stats are not technically parameters, they are not subject to
-    // combination like the rest of the model parameters).
-    SetBatchnormTestMode(true, &nnet);
-    SetDropoutTestMode(true, &nnet);
+    if (batchnorm_test_mode)
+      SetBatchnormTestMode(true, &nnet);
+    if (dropout_test_mode)
+      SetDropoutTestMode(true, &nnet);
 
     std::vector<NnetChainExample> egs;
     egs.reserve(10000);  // reserve a lot of space to minimize the chance of
diff --git a/src/chainbin/nnet3-chain-compute-prob.cc b/src/chainbin/nnet3-chain-compute-prob.cc
index 49827490fab..8cf25d4ad08 100644
--- a/src/chainbin/nnet3-chain-compute-prob.cc
+++ b/src/chainbin/nnet3-chain-compute-prob.cc
@@ -52,7 +52,6 @@ int main(int argc, char *argv[]) {
     po.Register("batchnorm-test-mode", &batchnorm_test_mode,
                 "If true, set test-mode to true on any BatchNormComponents.");
-
     po.Register("dropout-test-mode", &dropout_test_mode,
                 "If true, set test-mode to true on any DropoutComponents and "
                 "DropoutMaskComponents.");
diff --git a/src/nnet3/nnet-chain-combine.cc b/src/nnet3/nnet-chain-combine.cc
index d6c376ab45e..2dd9b049d9b 100644
--- a/src/nnet3/nnet-chain-combine.cc
+++ b/src/nnet3/nnet-chain-combine.cc
@@ -19,6 +19,7 @@
 
 #include "nnet3/nnet-chain-combine.h"
 #include "nnet3/nnet-utils.h"
+#include "nnet3/nnet-chain-training.h"
 
 namespace kaldi {
 namespace nnet3 {
@@ -38,7 +39,6 @@ NnetChainCombiner::NnetChainCombiner(const NnetCombineConfig &combine_config,
     nnet_params_(std::min(num_nnets, combine_config_.max_effective_inputs),
                  NumParameters(first_nnet)),
     tot_input_weighting_(nnet_params_.NumRows()) {
-  SetDropoutProportion(0, &nnet_);
   if (combine_config_.sum_to_one_penalty != 0.0 &&
       combine_config_.enforce_sum_to_one) {
@@ -182,6 +182,18 @@ void NnetChainCombiner::Combine() {
     ComputeObjfAndDerivFromParameters(final_params, &deriv);
   }
   PrintParams(final_params);
+  if (HasBatchnorm(nnet_)) {
+    RecomputeBatchnormStats();
+  }
+}
+
+void NnetChainCombiner::RecomputeBatchnormStats() {
+  KALDI_LOG << "Recomputing batch-norm stats on nnet.";
+  NnetChainTrainingOptions train_opts;
+  train_opts.nnet_config.train = false;
+  NnetChainTrainer trainer(train_opts, den_fst_, &nnet_);
+  for (size_t i = 0; i < egs_.size(); i++)
+    trainer.Train(egs_[i]);
 }
diff --git a/src/nnet3/nnet-chain-combine.h b/src/nnet3/nnet-chain-combine.h
index 3aeb3882650..4f4ed55202d 100644
--- a/src/nnet3/nnet-chain-combine.h
+++ b/src/nnet3/nnet-chain-combine.h
@@ -194,6 +194,7 @@ class NnetChainCombiner {
   void ComputeUpdatableComponentDims();
   void FinishPreprocessingInput();
 
+  void RecomputeBatchnormStats();
 
 };
diff --git a/src/nnet3/nnet-chain-training.cc b/src/nnet3/nnet-chain-training.cc
index e955710fff6..1a6bebd0e3d 100644
--- a/src/nnet3/nnet-chain-training.cc
+++ b/src/nnet3/nnet-chain-training.cc
@@ -58,8 +58,8 @@ NnetChainTrainer::NnetChainTrainer(const NnetChainTrainingOptions &opts,
 
 
 void NnetChainTrainer::Train(const NnetChainExample &chain_eg) {
-  bool need_model_derivative = true;
   const NnetTrainerOptions &nnet_config = opts_.nnet_config;
+  bool need_model_derivative = nnet_config.train;
   bool use_xent_regularization = (opts_.chain_config.xent_regularize != 0.0);
   ComputationRequest request;
   GetChainComputationRequest(*nnet_, chain_eg, need_model_derivative,
@@ -73,16 +73,21 @@ void NnetChainTrainer::Train(const NnetChainExample &chain_eg) {
   // give the inputs to the computer object.
   computer.AcceptInputs(*nnet_, chain_eg.inputs);
   computer.Run();
-
   this->ProcessOutputs(chain_eg, &computer);
-  computer.Run();
-
-  UpdateParamsWithMaxChange();
+  if (nnet_config.train) {
+    computer.Run();
+    UpdateParamsWithMaxChange();
+  } else {
+    // all parameter derivs will be zero; here we're just adding the stored stats.
+    AddNnet(*delta_nnet_, 1.0, nnet_);
+    ScaleNnet(0.0, delta_nnet_);
+  }
 }
 
 void NnetChainTrainer::ProcessOutputs(const NnetChainExample &eg,
                                       NnetComputer *computer) {
+  bool train = opts_.nnet_config.train;
   // normally the eg will have just one output named 'output', but
   // we don't assume this.
   std::vector<NnetChainSupervision>::const_iterator iter = eg.outputs.begin(),
      end = eg.outputs.end();
@@ -111,7 +116,7 @@ void NnetChainTrainer::ProcessOutputs(const NnetChainExample &eg,
     ComputeChainObjfAndDeriv(opts_.chain_config, den_graph_,
                              sup.supervision, nnet_output,
                              &tot_objf, &tot_l2_term, &tot_weight,
-                             &nnet_output_deriv,
+                             (train ? &nnet_output_deriv : NULL),
                              (use_xent ? &xent_deriv : NULL));
 
     if (use_xent) {
@@ -126,20 +131,21 @@ void NnetChainTrainer::ProcessOutputs(const NnetChainExample &eg,
                           tot_weight, xent_objf);
     }
 
-    if (opts_.apply_deriv_weights && sup.deriv_weights.Dim() != 0) {
+    if (train && opts_.apply_deriv_weights && sup.deriv_weights.Dim() != 0) {
       CuVector<BaseFloat> cu_deriv_weights(sup.deriv_weights);
       nnet_output_deriv.MulRowsVec(cu_deriv_weights);
       if (use_xent)
         xent_deriv.MulRowsVec(cu_deriv_weights);
     }
 
-    computer->AcceptInput(sup.name, &nnet_output_deriv);
+    if (train)
+      computer->AcceptInput(sup.name, &nnet_output_deriv);
 
     objf_info_[sup.name].UpdateStats(sup.name, opts_.nnet_config.print_interval,
                                      num_minibatches_processed_++,
                                      tot_weight, tot_objf, tot_l2_term);
 
-    if (use_xent) {
+    if (train && use_xent) {
       xent_deriv.Scale(opts_.chain_config.xent_regularize);
       computer->AcceptInput(xent_name, &xent_deriv);
     }
diff --git a/src/nnet3/nnet-combine.cc b/src/nnet3/nnet-combine.cc
index ba904b1c93a..c34d2e9631e 100644
--- a/src/nnet3/nnet-combine.cc
+++ b/src/nnet3/nnet-combine.cc
@@ -41,7 +41,6 @@ NnetCombiner::NnetCombiner(const NnetCombineConfig &config,
               << " is nonzero, so setting --enforce-sum-to-one=false.";
     config_.enforce_sum_to_one = false;
   }
-  SetDropoutProportion(0, &nnet_);
   SubVector<BaseFloat> first_params(nnet_params_, 0);
   VectorizeNnet(nnet_, &first_params);
   tot_input_weighting_(0) += 1.0;
@@ -178,6 +177,19 @@ void NnetCombiner::Combine() {
     ComputeObjfAndDerivFromParameters(final_params, &deriv);
   }
   PrintParams(final_params);
+
+  if (HasBatchnorm(nnet_)) {
+    RecomputeBatchnormStats();
+  }
+}
+
+void NnetCombiner::RecomputeBatchnormStats() {
+  KALDI_LOG << "Recomputing batch-norm stats on nnet.";
+  NnetTrainerOptions train_opts;
+  train_opts.train = false;
+  NnetTrainer trainer(train_opts, &nnet_);
+  for (size_t i = 0; i < egs_.size(); i++)
+    trainer.Train(egs_[i]);
 }
diff --git a/src/nnet3/nnet-combine.h b/src/nnet3/nnet-combine.h
index 5b60d30b8ed..fda89027cf9 100644
--- a/src/nnet3/nnet-combine.h
+++ b/src/nnet3/nnet-combine.h
@@ -240,6 +240,8 @@ class NnetCombiner {
   void ComputeUpdatableComponentDims();
   void FinishPreprocessingInput();
 
+  void RecomputeBatchnormStats();
+
 
 };
diff --git a/src/nnet3/nnet-training.cc b/src/nnet3/nnet-training.cc
index 2a081920738..0bcf79ce662 100644
--- a/src/nnet3/nnet-training.cc
+++ b/src/nnet3/nnet-training.cc
@@ -55,7 +55,7 @@ NnetTrainer::NnetTrainer(const NnetTrainerOptions &config,
 
 
 void NnetTrainer::Train(const NnetExample &eg) {
-  bool need_model_derivative = true;
+  bool need_model_derivative = config_.train;
   ComputationRequest request;
   GetComputationRequest(*nnet_, eg, need_model_derivative,
                         config_.store_component_stats,
@@ -69,9 +69,16 @@ void NnetTrainer::Train(const NnetExample &eg) {
   computer.Run();
 
   this->ProcessOutputs(eg, &computer);
-  computer.Run();
 
-  UpdateParamsWithMaxChange();
+  if (config_.train) {
+    computer.Run();
+
+    UpdateParamsWithMaxChange();
+  } else {
+    // all parameter derivs will be zero; here we're just adding the stored stats.
+    AddNnet(*delta_nnet_, 1.0, nnet_);
+    ScaleNnet(0.0, delta_nnet_);
+  }
 }
 
 void NnetTrainer::ProcessOutputs(const NnetExample &eg,
@@ -85,7 +92,7 @@ void NnetTrainer::ProcessOutputs(const NnetExample &eg,
     if (nnet_->IsOutputNode(node_index)) {
       ObjectiveType obj_type = nnet_->GetNode(node_index).u.objective_type;
       BaseFloat tot_weight, tot_objf;
-      bool supply_deriv = true;
+      bool supply_deriv = config_.train;
       ComputeObjectiveFunction(io.features, obj_type, io.name, supply_deriv,
                                computer, &tot_weight, &tot_objf);
diff --git a/src/nnet3/nnet-training.h b/src/nnet3/nnet-training.h
index 64a66612368..46897b51d88 100644
--- a/src/nnet3/nnet-training.h
+++ b/src/nnet3/nnet-training.h
@@ -33,6 +33,7 @@ namespace nnet3 {
 struct NnetTrainerOptions {
   bool zero_component_stats;
   bool store_component_stats;
+  bool train;
   int32 print_interval;
   bool debug_computation;
   BaseFloat momentum;
@@ -46,6 +47,7 @@ struct NnetTrainerOptions {
   NnetTrainerOptions():
       zero_component_stats(true),
       store_component_stats(true),
+      train(true),
       print_interval(100),
       debug_computation(false),
       momentum(0.0),
@@ -76,6 +78,10 @@ struct NnetTrainerOptions {
                    "write the cached computation to");
     opts->Register("binary-write-cache", &binary_write_cache, "Write "
                    "computation cache in binary mode");
+    opts->Register("train", &train, "If true, actually do the training "
+                   "(if false, it will do only the forward propagation, "
+                   "which affects stored stats for batch-norm, among other "
+                   "things.)");
 
     // register the optimization options with the prefix "optimization".
     ParseOptions optimization_opts("optimization", opts);
diff --git a/src/nnet3/nnet-utils.cc b/src/nnet3/nnet-utils.cc
index fe9e9f91997..8950c009b91 100644
--- a/src/nnet3/nnet-utils.cc
+++ b/src/nnet3/nnet-utils.cc
@@ -469,6 +469,16 @@ void SetDropoutProportion(BaseFloat dropout_proportion,
   }
 }
 
+bool HasBatchnorm(const Nnet &nnet) {
+  for (int32 c = 0; c < nnet.NumComponents(); c++) {
+    const Component *comp = nnet.GetComponent(c);
+    const BatchNormComponent *bc =
+        dynamic_cast<const BatchNormComponent*>(comp);
+    if (bc != NULL)
+      return true;
+  }
+  return false;
+}
 
 void SetBatchnormTestMode(bool test_mode, Nnet *nnet) {
   for (int32 c = 0; c < nnet->NumComponents(); c++) {
diff --git a/src/nnet3/nnet-utils.h b/src/nnet3/nnet-utils.h
index 6f645d89c56..0b5faadc1b8 100644
--- a/src/nnet3/nnet-utils.h
+++ b/src/nnet3/nnet-utils.h
@@ -164,6 +164,11 @@ std::string NnetInfo(const Nnet &nnet);
 /// dropout_proportion value.
 void SetDropoutProportion(BaseFloat dropout_proportion, Nnet *nnet);
 
+
+/// Returns true if nnet has at least one component of type
+/// BatchNormComponent.
+bool HasBatchnorm(const Nnet &nnet);
+
 /// This function affects only components of type BatchNormComponent.
 /// It sets "test mode" on such components (if you call it with test_mode =
 /// true, otherwise it would set normal mode, but this wouldn't be needed
diff --git a/src/nnet3bin/nnet3-combine.cc b/src/nnet3bin/nnet3-combine.cc
index 7885bb70b6b..30ad338ab4b 100644
--- a/src/nnet3bin/nnet3-combine.cc
+++ b/src/nnet3bin/nnet3-combine.cc
@@ -40,11 +40,18 @@ int main(int argc, char *argv[]) {
         " nnet3-combine 1.1.raw 1.2.raw 1.3.raw ark:valid.egs 2.raw\n";
 
     bool binary_write = true;
+    bool batchnorm_test_mode = false,
+        dropout_test_mode = true;
     std::string use_gpu = "yes";
     NnetCombineConfig combine_config;
 
     ParseOptions po(usage);
     po.Register("binary", &binary_write, "Write output in binary mode");
+    po.Register("batchnorm-test-mode", &batchnorm_test_mode,
+                "If true, set test-mode to true on any BatchNormComponents.");
+    po.Register("dropout-test-mode", &dropout_test_mode,
+                "If true, set test-mode to true on any DropoutComponents and "
+                "DropoutMaskComponents.");
     po.Register("use-gpu", &use_gpu,
                 "yes|no|optional|wait, only has effect if compiled with CUDA");
@@ -69,13 +76,10 @@ int main(int argc, char *argv[]) {
     Nnet nnet;
     ReadKaldiObject(nnet_rxfilename, &nnet);
 
-    // This is needed for batch-norm. We also ensure in the calling script
-    // that the freshest model comes first on the command line; this
-    // means we use the freshest batch-norm stats. (Since the batch-norm
-    // stats are not technically parameters, they are not subject to
-    // combination like the rest of the model parameters).
-    SetBatchnormTestMode(true, &nnet);
-    SetDropoutTestMode(true, &nnet);
+    if (batchnorm_test_mode)
+      SetBatchnormTestMode(true, &nnet);
+    if (dropout_test_mode)
+      SetDropoutTestMode(true, &nnet);
 
     std::vector<NnetExample> egs;
     egs.reserve(10000);  // reserve a lot of space to minimize the chance of
@@ -99,14 +103,11 @@ int main(int argc, char *argv[]) {
         ReadKaldiObject(po.GetArg(1 + n), &nnet);
         combiner.AcceptNnet(nnet);
       }
-
      combiner.Combine();
-
 #if HAVE_CUDA==1
      CuDevice::Instantiate().PrintProfile();
 #endif
-
      WriteKaldiObject(combiner.GetNnet(), nnet_wxfilename, binary_write);
    } else {
      KALDI_LOG << "Copying the single input model directly to the output, "
diff --git a/src/nnet3bin/nnet3-compute-prob.cc b/src/nnet3bin/nnet3-compute-prob.cc
index a67e76976c4..3dc4de3a662 100644
--- a/src/nnet3bin/nnet3-compute-prob.cc
+++ b/src/nnet3bin/nnet3-compute-prob.cc
@@ -51,7 +51,6 @@ int main(int argc, char *argv[]) {
     po.Register("batchnorm-test-mode", &batchnorm_test_mode,
                 "If true, set test-mode to true on any BatchNormComponents.");
-
     po.Register("dropout-test-mode", &dropout_test_mode,
                 "If true, set test-mode to true on any DropoutComponents and "
                 "DropoutMaskComponents.");
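
Note (not part of the patch): a hedged usage sketch of the combination binary after this change; the model and egs paths below are placeholders, not taken from the patch. Because --batchnorm-test-mode now defaults to false, batch-norm components stay in training mode during combination, and Combine() recomputes their stats from the combination examples with a forward-only pass (the trainer's new --train=false mode internally); dropout still defaults to test mode.

# Hypothetical invocation; argument order follows the usage string above
# (<nnet-in1> ... <nnet-inN> <examples-in> <nnet-out>), and all paths are made up.
# Relying on the new defaults (--batchnorm-test-mode=false --dropout-test-mode=true),
# batch-norm stats are recomputed inside Combine() whenever the model has batch-norm.
nnet3-combine exp/model/35.raw exp/model/36.raw exp/model/37.raw \
  ark:exp/model/egs/combine.egs exp/model/final.raw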