diff --git a/egs/swbd/s5c/local/xvector/train.sh b/egs/swbd/s5c/local/xvector/train.sh index f0499ee5741..6dca8b99458 100755 --- a/egs/swbd/s5c/local/xvector/train.sh +++ b/egs/swbd/s5c/local/xvector/train.sh @@ -7,10 +7,13 @@ set -e stage=1 -train_stage=1 +train_stage=-10 generate_alignments=true # false if doing ctc training speed_perturb=true - +init_lr=0.003 +final_lr=0.0003 +max_change=2.0 +use_gpu=true feat_dim=40 # this is the MFCC dim we use in the hires features. you can't change it # unless you change local/xvector/prepare_perturbed_data.sh to use a different # MFCC config with a different dimension. @@ -18,6 +21,7 @@ data=data/train_nodup_sp_hires # you can't change this without changing # local/xvector/prepare_perturbed_data.sh xvector_dim=200 # dimension of the xVector. configurable. xvector_dir=exp/xvector_a +egs_dir=exp/xvector_a/egs . ./path.sh @@ -40,18 +44,21 @@ if [ $stage -le 3 ]; then $xvector_dir/nnet.config fi -if [ $stage -le 4 ]; then +if [ $stage -le 4 ] && [ -z "$egs_dir" ]; then # dump egs. steps/nnet3/xvector/get_egs.sh --cmd "$train_cmd" \ - "$data" $xvector_dir/egs + "$data" $egs_dir fi if [ $stage -le 5 ]; then # training for 4 epochs * 3 shifts means we see each eg 12 # times (3 different frame-shifts of the same eg are counted as different). 
steps/nnet3/xvector/train.sh --cmd "$train_cmd" \ - --num-epochs 4 --num-shifts 3 \ - --num-jobs-initial 2 --num-jobs-final 8 \ + --num-epochs 4 --num-shifts 3 --use-gpu $use_gpu --stage $train_stage \ + --initial-effective-lrate $init_lr --final-effective-lrate $final_lr \ + --num-jobs-initial 1 --num-jobs-final 8 \ + --max-param-change $max_change \ + --egs-dir $egs_dir \ $xvector_dir fi diff --git a/egs/wsj/s5/steps/nnet3/xvector/make_jesus_configs.py b/egs/wsj/s5/steps/nnet3/xvector/make_jesus_configs.py index 51d58c5b89c..61eb2d41c24 100755 --- a/egs/wsj/s5/steps/nnet3/xvector/make_jesus_configs.py +++ b/egs/wsj/s5/steps/nnet3/xvector/make_jesus_configs.py @@ -271,7 +271,7 @@ def WriteConfigs(self, f): # just have an affine component for the first hidden layer. # we don't need a nonlinearity as there is one at the input of # the jesus component. - print('component name=x-affine1 type=AffineComponent ' + print('component name=x-affine1 type=NaturalGradientAffineComponent ' 'input-dim={0} output-dim={1} bias-stddev=0'.format( cur_dim, args.jesus_input_dim), file=f) print('component-node name=x-affine1 component=x-affine1 input={0}'.format( diff --git a/egs/wsj/s5/steps/nnet3/xvector/train.sh b/egs/wsj/s5/steps/nnet3/xvector/train.sh index a05c62c5124..f79c2680b1c 100755 --- a/egs/wsj/s5/steps/nnet3/xvector/train.sh +++ b/egs/wsj/s5/steps/nnet3/xvector/train.sh @@ -9,8 +9,8 @@ cmd=run.pl num_epochs=4 # Number of epochs of training; # the number of iterations is worked out from this. 
num_shifts=3 -initial_effective_lrate=0.0003 -final_effective_lrate=0.00003 +initial_effective_lrate=0.003 +final_effective_lrate=0.0003 num_jobs_initial=2 # Number of neural net jobs to run in parallel at the start of training num_jobs_final=8 # Number of neural net jobs to run in parallel at the end of training stage=-3 @@ -129,7 +129,7 @@ while [ $x -lt $num_iters ]; do if [ $stage -le $x ]; then echo "On iteration $x, learning rate is $this_learning_rate" - + raw="nnet3-copy --learning-rate=$this_learning_rate $dir/$x.raw - |" # Set off jobs doing some diagnostics, in the background. # Use the egs dir from the previous iteration for the diagnostics $cmd JOB=1:$num_diagnostic_archives $dir/log/compute_prob_valid.$x.JOB.log \ @@ -142,7 +142,7 @@ while [ $x -lt $num_iters ]; do if [ $x -gt 0 ]; then $cmd $dir/log/progress.$x.log \ nnet3-info $dir/$x.raw '&&' \ - nnet3-show-progress --use-gpu=no $dir/$[$x-1].raw $dir/$x.raw & + nnet3-show-progress --use-gpu=no $dir/$[$x-1].raw $dir/$x.raw & fi echo "Training neural net (pass $x)" @@ -174,8 +174,7 @@ while [ $x -lt $num_iters ]; do $cmd $train_queue_opt $dir/log/train.$x.$n.log \ nnet3-xvector-train $parallel_train_opts --print-interval=10 \ - --max-param-change=$max_param_change \ - $dir/$x.raw \ + --max-param-change=$max_param_change "$raw" \ "ark:nnet3-copy-egs ark:$egs_dir/egs.$archive.ark ark:- | nnet3-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-| nnet3-merge-egs --measure-output-frames=false --minibatch-size=$minibatch_size --discard-partial-minibatches=true ark:- ark:- |" \ $dir/$[$x+1].$n.raw || touch $dir/.error & done diff --git a/src/xvector/nnet-xvector-training.cc b/src/xvector/nnet-xvector-training.cc index 7327af90d45..5294879e69f 100644 --- a/src/xvector/nnet-xvector-training.cc +++ b/src/xvector/nnet-xvector-training.cc @@ -30,13 +30,14 @@ NnetXvectorTrainer::NnetXvectorTrainer(const NnetTrainerOptions &config, nnet_(nnet), compiler_(*nnet, config_.optimize_config), 
num_minibatches_processed_(0) { - if (config.zero_component_stats) + if (config_.zero_component_stats) ZeroComponentStats(nnet); - if (config.momentum == 0.0 && config.max_param_change == 0.0) { + if (config_.momentum == 0.0 && + config_.max_param_change == 0.0) { delta_nnet_= NULL; } else { - KALDI_ASSERT(config.momentum >= 0.0 && - config.max_param_change >= 0.0); + KALDI_ASSERT(config_.momentum >= 0.0 && + config_.max_param_change >= 0.0); delta_nnet_ = nnet_->Copy(); bool is_gradient = false; // setting this to true would disable the // natural-gradient updates. @@ -94,7 +95,8 @@ void NnetXvectorTrainer::Train(const NnetExample &eg) { ScaleNnet(config_.momentum, delta_nnet_); } if (config_.write_cache != "") { - Output ko(config_.write_cache, config_.binary_write_cache); + Output ko(config_.write_cache, + config_.binary_write_cache); compiler_.WriteCache(ko.Stream(), config_.binary_write_cache); } } @@ -143,7 +145,8 @@ void NnetXvectorTrainer::ProcessOutputs(NnetComputer *computer) { computer->AcceptOutputDeriv(b_name, &deriv_b_mat); } - objf_info_[xvector_name].UpdateStats(xvector_name, config_.print_interval, + objf_info_[xvector_name].UpdateStats(xvector_name, + config_.print_interval, num_minibatches_processed_++, tot_weight, tot_objf); } @@ -246,7 +249,7 @@ void GetComputationRequestXvector(const Nnet &nnet, request->need_model_derivative = need_model_derivative; request->store_component_stats = store_component_stats; - // xvector-egs have multiple inputs(e.g. different inputs correspond + // xvector-egs has multiple inputs(e.g. different inputs correspond // to different chunks and no outputs. 
for (size_t i = 0; i < eg.io.size(); i++) { const NnetIo &io = eg.io[i]; @@ -263,21 +266,34 @@ void GetComputationRequestXvector(const Nnet &nnet, IoSpecification &io_spec = dest.back(); io_spec.name = name; io_spec.indexes = io.indexes; - io_spec.has_deriv = nnet.IsOutputNode(node_index) && need_model_derivative; + io_spec.has_deriv = false; } // We only need the output on frame t=0 for each n. + // So the indexes for the output node are (n, 0, 0) for n = 0, ..., N-1, + // where N is the smallest number of n-values found for any single t + // in the input indexes. + // The indexes for the "s" and "b" output nodes are just (0, 0, 0). int32 io_index_size = request->inputs[0].indexes.size(), - n_indx_size = 0; + n_indx_size = 1e6, t_ind; std::vector output_indexes, affine_output_indexes; affine_output_indexes.resize(1); affine_output_indexes[0].n = 0; affine_output_indexes[0].t = 0; + + std::map n_indx_sizes; + for (int32 indx = 0; indx < io_index_size; indx++) { + t_ind = request->inputs[0].indexes[indx].t; + if (n_indx_sizes.count(t_ind) != 0) + n_indx_sizes[t_ind] += 1; + else + n_indx_sizes.insert(std::make_pair(t_ind, 1)); + } + std::map::const_iterator iter; + for (iter = n_indx_sizes.begin(); iter != n_indx_sizes.end(); iter++) + n_indx_size = std::min(n_indx_size, iter->second); - for (int32 indx = 0; indx < io_index_size; indx++) - if (request->inputs[0].indexes[indx].t == 0) - n_indx_size++; output_indexes.resize(n_indx_size); for (int32 indx = 0; indx < n_indx_size; indx++) { diff --git a/src/xvector/xvector.cc b/src/xvector/xvector.cc index 604d70e9c14..aab825ba60b 100644 --- a/src/xvector/xvector.cc +++ b/src/xvector/xvector.cc @@ -40,6 +40,9 @@ void ComputeXvectorObjfAndDeriv( KALDI_ASSERT(deriv_xvector->NumCols() == xvector_dim); KALDI_ASSERT(deriv_xvector->NumRows() == N); KALDI_ASSERT(deriv_S->Dim() == S_dim); + deriv_xvector->Set(0.0); + deriv_S->Set(0.0); + (*deriv_b) = 0.0; } CuMatrix S_tmp(S),