diff --git a/egs/swbd/s5c/local/xvector/train.sh b/egs/swbd/s5c/local/xvector/train.sh
index f0499ee5741..6dca8b99458 100755
--- a/egs/swbd/s5c/local/xvector/train.sh
+++ b/egs/swbd/s5c/local/xvector/train.sh
@@ -7,10 +7,13 @@
 set -e
 
 stage=1
-train_stage=1
+train_stage=-10
 generate_alignments=true # false if doing ctc training
 speed_perturb=true
-
+init_lr=0.003
+final_lr=0.0003
+max_change=2.0
+use_gpu=true
 feat_dim=40 # this is the MFCC dim we use in the hires features.  you can't change it
             # unless you change local/xvector/prepare_perturbed_data.sh to use a different
             # MFCC config with a different dimension.
@@ -18,6 +21,7 @@ data=data/train_nodup_sp_hires  # you can't change this without changing
                                 # local/xvector/prepare_perturbed_data.sh
 xvector_dim=200 # dimension of the xVector.  configurable.
 xvector_dir=exp/xvector_a
+egs_dir=exp/xvector_a/egs
 
 
 . ./path.sh
@@ -40,18 +44,21 @@ if [ $stage -le 3 ]; then
       $xvector_dir/nnet.config
 fi
 
-if [ $stage -le 4 ]; then
+if [ $stage -le 4 ] && [ -n "$egs_dir" ]; then
   # dump egs.
   steps/nnet3/xvector/get_egs.sh --cmd "$train_cmd" \
-    "$data" $xvector_dir/egs
+    "$data" $egs_dir
 fi
 
 if [ $stage -le 5 ]; then
   # training for 4 epochs * 3 shifts means we see each eg 12
   # times (3 different frame-shifts of the same eg are counted as different).
   steps/nnet3/xvector/train.sh --cmd "$train_cmd" \
-      --num-epochs 4 --num-shifts 3 \
-      --num-jobs-initial 2 --num-jobs-final 8 \
+      --num-epochs 4 --num-shifts 3 --use-gpu $use_gpu --stage $train_stage \
+      --initial-effective-lrate $init_lr --final-effective-lrate $final_lr \
+      --num-jobs-initial 1 --num-jobs-final 8 \
+      --max-param-change $max_change \
+      --egs-dir $egs_dir \
       $xvector_dir
 fi
 
diff --git a/egs/wsj/s5/steps/nnet3/xvector/make_jesus_configs.py b/egs/wsj/s5/steps/nnet3/xvector/make_jesus_configs.py
index 51d58c5b89c..61eb2d41c24 100755
--- a/egs/wsj/s5/steps/nnet3/xvector/make_jesus_configs.py
+++ b/egs/wsj/s5/steps/nnet3/xvector/make_jesus_configs.py
@@ -271,7 +271,7 @@ def WriteConfigs(self, f):
         # just have an affine component for the first hidden layer.
         # we don't need a nonlinearity as there is one at the input of
         # the jesus component.
-        print('component name=x-affine1 type=AffineComponent '
+        print('component name=x-affine1 type=NaturalGradientAffineComponent '
               'input-dim={0} output-dim={1} bias-stddev=0'.format(
                 cur_dim, args.jesus_input_dim), file=f)
         print('component-node name=x-affine1 component=x-affine1 input={0}'.format(
diff --git a/egs/wsj/s5/steps/nnet3/xvector/train.sh b/egs/wsj/s5/steps/nnet3/xvector/train.sh
index a05c62c5124..f79c2680b1c 100755
--- a/egs/wsj/s5/steps/nnet3/xvector/train.sh
+++ b/egs/wsj/s5/steps/nnet3/xvector/train.sh
@@ -9,8 +9,8 @@ cmd=run.pl
 num_epochs=4      # Number of epochs of training;
                   # the number of iterations is worked out from this.
 num_shifts=3
-initial_effective_lrate=0.0003
-final_effective_lrate=0.00003
+initial_effective_lrate=0.003
+final_effective_lrate=0.0003
 num_jobs_initial=2 # Number of neural net jobs to run in parallel at the start of training
 num_jobs_final=8   # Number of neural net jobs to run in parallel at the end of training
 stage=-3
@@ -129,7 +129,7 @@ while [ $x -lt $num_iters ]; do
 
   if [ $stage -le $x ]; then
     echo "On iteration $x, learning rate is $this_learning_rate"
-
+    raw="nnet3-copy --learning-rate=$this_learning_rate $dir/$x.raw - |"
     # Set off jobs doing some diagnostics, in the background.
     # Use the egs dir from the previous iteration for the diagnostics
     $cmd JOB=1:$num_diagnostic_archives $dir/log/compute_prob_valid.$x.JOB.log \
@@ -142,7 +142,7 @@ while [ $x -lt $num_iters ]; do
     if [ $x -gt 0 ]; then
       $cmd $dir/log/progress.$x.log \
         nnet3-info $dir/$x.raw '&&' \
-        nnet3-show-progress --use-gpu=no $dir/$[$x-1].raw $dir/$x.raw  &
+        nnet3-show-progress --use-gpu=no $dir/$[$x-1].raw $dir/$x.raw &
     fi
 
     echo "Training neural net (pass $x)"
@@ -174,8 +174,7 @@ while [ $x -lt $num_iters ]; do
 
         $cmd $train_queue_opt $dir/log/train.$x.$n.log \
           nnet3-xvector-train $parallel_train_opts --print-interval=10 \
-          --max-param-change=$max_param_change \
-         $dir/$x.raw \
+          --max-param-change=$max_param_change "$raw" \
           "ark:nnet3-copy-egs ark:$egs_dir/egs.$archive.ark ark:- | nnet3-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-| nnet3-merge-egs --measure-output-frames=false --minibatch-size=$minibatch_size --discard-partial-minibatches=true ark:- ark:- |" \
           $dir/$[$x+1].$n.raw || touch $dir/.error &
       done
diff --git a/src/xvector/nnet-xvector-training.cc b/src/xvector/nnet-xvector-training.cc
index 7327af90d45..5294879e69f 100644
--- a/src/xvector/nnet-xvector-training.cc
+++ b/src/xvector/nnet-xvector-training.cc
@@ -30,13 +30,14 @@ NnetXvectorTrainer::NnetXvectorTrainer(const NnetTrainerOptions &config,
     nnet_(nnet),
     compiler_(*nnet, config_.optimize_config),
     num_minibatches_processed_(0) {
-  if (config.zero_component_stats)
+  if (config_.zero_component_stats)
     ZeroComponentStats(nnet);
-  if (config.momentum == 0.0 && config.max_param_change == 0.0) {
+  if (config_.momentum == 0.0 && 
+      config_.max_param_change == 0.0) {
     delta_nnet_= NULL;
   } else {
-    KALDI_ASSERT(config.momentum >= 0.0 &&
-                 config.max_param_change >= 0.0);
+    KALDI_ASSERT(config_.momentum >= 0.0 &&
+                 config_.max_param_change >= 0.0);
     delta_nnet_ = nnet_->Copy();
     bool is_gradient = false;  // setting this to true would disable the
                                // natural-gradient updates.
@@ -94,7 +95,8 @@ void NnetXvectorTrainer::Train(const NnetExample &eg) {
     ScaleNnet(config_.momentum, delta_nnet_);
   }
   if (config_.write_cache != "") {
-    Output ko(config_.write_cache, config_.binary_write_cache);
+    Output ko(config_.write_cache, 
+      config_.binary_write_cache);
     compiler_.WriteCache(ko.Stream(), config_.binary_write_cache);
   }
 }
@@ -143,7 +145,8 @@ void NnetXvectorTrainer::ProcessOutputs(NnetComputer *computer) {
           computer->AcceptOutputDeriv(b_name, &deriv_b_mat);
         }
 
-        objf_info_[xvector_name].UpdateStats(xvector_name, config_.print_interval,
+        objf_info_[xvector_name].UpdateStats(xvector_name, 
+                                             config_.print_interval,
                                              num_minibatches_processed_++,
                                              tot_weight, tot_objf);
       }
@@ -246,7 +249,7 @@ void GetComputationRequestXvector(const Nnet &nnet,
   request->need_model_derivative = need_model_derivative;
   request->store_component_stats = store_component_stats;
 
-  // xvector-egs have multiple inputs(e.g. different inputs correspond
+  // An xvector eg has multiple inputs (e.g. different inputs correspond
   // to different chunks and no outputs.
   for (size_t i = 0; i < eg.io.size(); i++) {
     const NnetIo &io = eg.io[i];
@@ -263,21 +266,34 @@ void GetComputationRequestXvector(const Nnet &nnet,
     IoSpecification &io_spec = dest.back();
     io_spec.name = name;
     io_spec.indexes = io.indexes;
-    io_spec.has_deriv = nnet.IsOutputNode(node_index) && need_model_derivative;
+    io_spec.has_deriv = false; 
   }
 
   // We only need the output on frame t=0 for each n.
+  // So the indexes requested for the output node are (n, 0, 0) for
+  // n = 0, ..., N - 1, where N is the minimum, taken over the t values
+  // present in the input indexes, of the number of n-values for that t.
+  // The indexes for the "s" and "b" output nodes are just (0, 0, 0).
   int32 io_index_size = request->inputs[0].indexes.size(),
-         n_indx_size = 0;
+         n_indx_size = 1e6, t_ind;
   std::vector<Index> output_indexes, 
     affine_output_indexes;
   affine_output_indexes.resize(1);
   affine_output_indexes[0].n = 0;
   affine_output_indexes[0].t = 0;
+  
+  // Count the input indexes per t value; operator[] starts new counts at 0.
+  std::map<int32, int32> n_indx_sizes;
+  for (int32 indx = 0; indx < io_index_size; indx++) {
+    t_ind = request->inputs[0].indexes[indx].t;
+    n_indx_sizes[t_ind]++;
+  }
+  // With no input indexes, request no outputs rather than the 1e6 sentinel.
+  if (n_indx_sizes.empty()) n_indx_size = 0;
+  std::map<int32, int32>::const_iterator iter;
+  for (iter = n_indx_sizes.begin(); iter != n_indx_sizes.end(); ++iter)
+    n_indx_size = std::min(n_indx_size, iter->second);
 
-  for (int32 indx = 0; indx < io_index_size; indx++)
-    if (request->inputs[0].indexes[indx].t == 0)
-     n_indx_size++;
 
   output_indexes.resize(n_indx_size);
   for (int32 indx = 0; indx < n_indx_size; indx++) {
diff --git a/src/xvector/xvector.cc b/src/xvector/xvector.cc
index 604d70e9c14..aab825ba60b 100644
--- a/src/xvector/xvector.cc
+++ b/src/xvector/xvector.cc
@@ -40,6 +40,9 @@ void ComputeXvectorObjfAndDeriv(
     KALDI_ASSERT(deriv_xvector->NumCols() == xvector_dim);
     KALDI_ASSERT(deriv_xvector->NumRows() == N);
     KALDI_ASSERT(deriv_S->Dim() == S_dim);
+    deriv_xvector->Set(0.0);
+    deriv_S->Set(0.0);
+    (*deriv_b) = 0.0;
   }
 
   CuMatrix<BaseFloat> S_tmp(S),