diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu
index 41288b411c0..3b4fd527ad6 100644
--- a/src/cudamatrix/cu-kernels.cu
+++ b/src/cudamatrix/cu-kernels.cu
@@ -2098,20 +2098,34 @@ static void _diff_xent(const int32_cuda* vec_tgt, Real* mat_net_out, Real* vec_l
 template<typename Real>
 __global__
 static void _compute_xvector_objf(const Real* scores, MatrixDim scores_dim,
-                                  Real* obfj_terms, MatrixDim objf_dim, 
-                                  Real* obfj_derivs, MatrixDim derivs_dim) {
+                                  Real* objf_terms, MatrixDim objf_dim,
+                                  Real* objf_derivs, MatrixDim derivs_dim) {
+  // Thread (i, j) handles column i, row j.  The three matrices may have
+  // different strides, so each one gets its own linear index.
+  // NOTE(review): the CPU version in cu-math.cc uses i as the row and j as the
+  // column, i.e. the transpose of this layout -- confirm that is intended.
   int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x;
   int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y;
   int32_cuda scores_index = i + j * scores_dim.stride;
+  int32_cuda objf_index = i + j * objf_dim.stride;
+  int32_cuda derivs_index = i + j * derivs_dim.stride;
   Real K = 1.0 / (scores_dim.rows - 2.0);
-  Real L = scores[scores_index];
   if (i < scores_dim.cols && j < scores_dim.rows) {
+    // Read the score only inside the bounds check: threads past the edge of
+    // the matrix must not dereference scores[].
+    Real L = scores[scores_index];
     if (i + 1 == j && i % 2 == 0) {
-      obfj_terms[scores_index] = log(1.0 + exp(-L));
-      obfj_derivs[scores_index] = 1.0 / (1.0 + exp(L));
+      // For very negative L, log(1 + exp(-L)) is replaced by -L.
+      objf_terms[objf_index] = L < -15 ? -L : log(1.0 + exp(-L));
+      objf_derivs[derivs_index] = 1.0 / (1.0 + exp(L));
     } else if (i < j) {
-      obfj_terms[scores_index] = K * log(1.0 + exp(L));
-      obfj_derivs[scores_index] = -K / (1.0 + exp(-L));
+      // For very positive L, log(1 + exp(L)) is replaced by L.
+      objf_terms[objf_index] = K * (L > 15 ? L : log(1.0 + exp(L)));
+      objf_derivs[derivs_index] = -K / (1.0 + exp(-L));
+    } else {
+      // Remaining elements (i >= j) contribute nothing.
+      objf_terms[objf_index] = 0.0;
+      objf_derivs[derivs_index] = 0.0;
     }
   }
 }
diff --git a/src/cudamatrix/cu-math.cc b/src/cudamatrix/cu-math.cc
index 005bb3146c0..e655dde8a0f 100644
--- a/src/cudamatrix/cu-math.cc
+++ b/src/cudamatrix/cu-math.cc
@@ -227,16 +227,21 @@ void ComputeXvectorObjfFromScores(const CuMatrixBase<BaseFloat> &scores,
   } else
   #endif
   {
+    // Compute the xvector objective function and its derivatives in the CPU.
     int32 num_rows = scores.NumRows();
     BaseFloat K = 1.0 / (num_rows - 2.0);
     for (int32 i = 0; i < num_rows; i++) {
-      for (int32 j = i + 1; j < num_rows; j++) {
+      for (int32 j = 0; j < num_rows; j++) {
+        BaseFloat L = scores(i, j);
         if (i + 1 == j && i % 2 == 0) {
-          (*objf_terms)(i, j) = log(1.0 + exp(-scores(i, j)));
-          (*objf_derivs)(i, j) = 1.0 / (1.0 + exp(scores(i, j)));
+          (*objf_terms)(i, j) = L < -15 ? -L : log(1.0 + exp(-L));
+          (*objf_derivs)(i, j) = 1.0 / (1.0 + exp(L));
+        } else if (i < j) {
+          (*objf_terms)(i, j) = K * (L > 15 ? L : log(1.0 + exp(L)));
+          (*objf_derivs)(i, j) = -K / (1.0 + exp(-L));
         } else {
-          (*objf_terms)(i, j) = K * log(1.0 + exp(scores(i, j)));
-          (*objf_derivs)(i, j) = -K / (1.0 + exp(-scores(i, j)));
+          (*objf_terms)(i, j) = 0;
+          (*objf_derivs)(i, j) = 0;
         }
       }
     }
diff --git a/src/ivector/xvector-test.cc b/src/ivector/xvector-test.cc
index ae3b6d7e57b..a63f0532a7e 100644
--- a/src/ivector/xvector-test.cc
+++ b/src/ivector/xvector-test.cc
@@ -40,7 +40,7 @@ void TestComputeXvectorObjfAndDeriv(
     BaseFloat *tot_weight);
 
 bool TestXvectorExtractorDerivative(BaseFloat perturb_delta) {
-  int32 xvector_dim = RandInt(4, 50),
+  int32 xvector_dim = RandInt(4, 100),
         num_rows = 2 * RandInt(2, 10); // The number of rows must be even
                                        // and greater than 2.
   CuSpMatrix<BaseFloat> S(xvector_dim);
@@ -126,7 +126,7 @@ bool TestXvectorExtractorDerivative(BaseFloat perturb_delta) {
 }
 
 bool TestXvectorComputeObjf() {
-  int32 xvector_dim = RandInt(4, 40),
+  int32 xvector_dim = RandInt(4, 100),
         num_rows = 2 * RandInt(2, 10); // The number of rows must be even
                                        // and greater than 2.
   CuSpMatrix<BaseFloat> S(xvector_dim);
diff --git a/src/ivector/xvector.cc b/src/ivector/xvector.cc
index c06942d1cb6..57a9c69d604 100644
--- a/src/ivector/xvector.cc
+++ b/src/ivector/xvector.cc
@@ -46,10 +46,10 @@ void ComputeXvectorObjfAndDeriv(
                       P(N, xvector_dim),
                       Q(N, N),
                       R(N, N),
-                      scores(N, N),           // The raw scores.
-                      objf_terms(N, N),
-                      objf_deriv_terms(N, N); // Derivative of the
-                                              // objf w.r.t. the scores.
+                      scores(N, N),                 // The raw scores.
+                      objf_terms(N, N, kUndefined),
+                      objf_deriv_terms(N, N,        // Derivative of the
+                                       kUndefined); // objf w.r.t. the scores.
   CuVector<BaseFloat> r(N);
 
   P.AddMatMat(1.0, xvector_pairs, kNoTrans, S_tmp, kNoTrans, 0.0);
diff --git a/src/ivector/xvector.h b/src/ivector/xvector.h
index 53d0864575a..9b4b79de54a 100644
--- a/src/ivector/xvector.h
+++ b/src/ivector/xvector.h
@@ -32,7 +32,7 @@ namespace kaldi {
   /*
   Computes the training objective function and the derivatives for
   the xvector.  Let N = xvector_pairs.NumRows() be the number of
-  xvectors. There are N(N-1)/2 pairs in total and N from the same
+  xvectors. There are N(N-1)/2 pairs in total and N/2 from the same
   class. Let v(n) be the n'th row of the matrix xvector_pairs.
   The total objective function written to 'tot_objf' is
       \sum_{n=0}^{N/2} p_same(v(n*2), v(n*2+1))
@@ -61,9 +61,9 @@ namespace kaldi {
   the objective function with respect to the parameter b is written here.
   @param [out] tot_objf  The total objective function described above
   @param [out] tot_weight  The total normalizing factor for the objective
-  function, equal to dvector_pairs.NumRows().
+  function, equal to xvector_pairs.NumRows().
   */
-  void ComputeXvectorObjfAndDeriv(const CuMatrixBase<BaseFloat> &dvector_pairs,
+  void ComputeXvectorObjfAndDeriv(const CuMatrixBase<BaseFloat> &xvector_pairs,
     const CuSpMatrix<BaseFloat> &S,
     BaseFloat b,
     CuMatrixBase<BaseFloat> *deriv_xvector,
diff --git a/src/nnet3bin/Makefile b/src/nnet3bin/Makefile
index 0a57c17fad0..f8a02207b1c 100644
--- a/src/nnet3bin/Makefile
+++ b/src/nnet3bin/Makefile
@@ -12,7 +12,7 @@ BINFILES = nnet3-init nnet3-info nnet3-get-egs nnet3-copy-egs nnet3-subset-egs \
    nnet3-am-adjust-priors nnet3-am-copy nnet3-compute-prob \
    nnet3-average nnet3-am-info nnet3-combine nnet3-latgen-faster \
    nnet3-copy nnet3-show-progress nnet3-align-compiled \
-   nnet3-get-egs-dense-targets nnet3-compute
+   nnet3-get-egs-dense-targets nnet3-compute nnet3-xvector-get-egs
 
 OBJFILES =
 
diff --git a/src/nnet3bin/nnet3-xvector-get-egs.cc b/src/nnet3bin/nnet3-xvector-get-egs.cc
new file mode 100644
index 00000000000..24e50560b54
--- /dev/null
+++ b/src/nnet3bin/nnet3-xvector-get-egs.cc
@@ -0,0 +1,240 @@
+// nnet3bin/nnet3-xvector-get-egs.cc
+
+// Copyright 2012-2016  Johns Hopkins University (author:  Daniel Povey)
+//                2016  David Snyder
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sstream>
+
+#include "util/common-utils.h"
+#include "nnet3/nnet-example.h"
+
+namespace kaldi {
+namespace nnet3 {
+
+// A struct for holding information about the position and
+// duration of each pair of chunks.
+struct ChunkPairInfo {
+  std::string pair_name;    // key under which the example is written
+  int32 output_archive_id;  // index into the vector of example writers
+  int32 start_frame1;       // first frame of chunk 1 (row index into feats)
+  int32 start_frame2;       // first frame of chunk 2
+  int32 num_frames1;        // number of frames in chunk 1
+  int32 num_frames2;        // number of frames in chunk 2
+};
+
+// Reads the ranges file 'range_rxfilename' and fills 'utt_to_pairs', a map from
+// utterance name to the ChunkPairInfo structs requested for that utterance.
+// Each line must hold exactly 6 whitespace-separated fields:
+//   <utt> <output-archive-index> <start1> <num-frames1> <start2> <num-frames2>
+// and raises KALDI_ERR otherwise.  The structs are allocated with 'new'; the
+// caller owns them (see Cleanup() below).  Nothing is read if the filename is
+// empty.
+static void ProcessRangeFile(const std::string &range_rxfilename,
+                             unordered_map<std::string,
+                             std::vector<ChunkPairInfo *> > *utt_to_pairs) {
+  Input range_input(range_rxfilename);
+  if (range_rxfilename.empty())
+    return;
+
+  std::string line;
+  while (std::getline(range_input.Stream(), line)) {
+    std::vector<std::string> fields;
+    SplitStringToVector(line, " \t\n\r", true, &fields);
+    if (fields.size() != 6)
+      KALDI_ERR << "Expected 6 fields in line of range file, got "
+                << fields.size() << " instead.";
+
+    const std::string &utt = fields[0],
+                      &start_frame1_str = fields[2],
+                      &num_frames1_str = fields[3],
+                      &start_frame2_str = fields[4],
+                      &num_frames2_str = fields[5];
+
+    // Parse into a stack object first so that nothing is leaked in case
+    // KALDI_ERR throws on a malformed line.
+    ChunkPairInfo info;
+    if (!ConvertStringToInteger(fields[1], &(info.output_archive_id))
+        || !ConvertStringToInteger(start_frame1_str, &(info.start_frame1))
+        || !ConvertStringToInteger(start_frame2_str, &(info.start_frame2))
+        || !ConvertStringToInteger(num_frames1_str, &(info.num_frames1))
+        || !ConvertStringToInteger(num_frames2_str, &(info.num_frames2)))
+      KALDI_ERR << "Expected integer fields in line of range file: " << line;
+    info.pair_name = utt + "-" + start_frame1_str + "-" + num_frames1_str
+                   + "-" + start_frame2_str + "-" + num_frames2_str;
+    // operator[] creates the (empty) vector the first time 'utt' is seen.
+    (*utt_to_pairs)[utt].push_back(new ChunkPairInfo(info));
+  }
+}
+
+// Writes one NnetExample per entry of 'pairs' to the writer selected by its
+// output_archive_id.  Each example holds two chunks of 'feats': "input1"
+// (indexes get n = 0) and "input2" (n = 1).  Pairs requesting more frames than
+// 'feats' has are skipped with a warning; '*num_egs_written' counts writes.
+static void WriteExamples(const MatrixBase<BaseFloat> &feats,
+                          const std::vector<ChunkPairInfo *> &pairs,
+                          const std::string &utt,
+                          bool compress,
+                          int32 *num_egs_written,
+                          std::vector<NnetExampleWriter *> *example_writers) {
+  const int32 num_rows = feats.NumRows(),
+              feat_dim = feats.NumCols(),
+              num_writers = static_cast<int32>(example_writers->size());
+  for (std::vector<ChunkPairInfo *>::const_iterator it = pairs.begin();
+      it != pairs.end(); ++it) {
+    const ChunkPairInfo *pair = *it;
+    const int32 max_frames = std::max(pair->num_frames1, pair->num_frames2);
+    if (num_rows < max_frames) {
+      KALDI_WARN << "Unable to create examples for utterance " << utt
+                 << ". Requested chunk size of " << max_frames
+                 << " but utterance has only " << num_rows << " frames.";
+      continue;
+    }
+    // Signed comparison; a negative index is rejected explicitly.
+    if (pair->output_archive_id < 0 || pair->output_archive_id >= num_writers)
+      KALDI_ERR << "Requested output index exceeds number of specified "
+                << "output files.";
+    // The requested chunk positions are approximate.  If a chunk would run
+    // past the end of the utterance, shift its start back so that it fits.
+    int32 shift1 = std::min(0, num_rows - pair->start_frame1
+                               - pair->num_frames1),
+          shift2 = std::min(0, num_rows - pair->start_frame2
+                               - pair->num_frames2);
+    SubMatrix<BaseFloat> chunk1(feats, pair->start_frame1 + shift1,
+                                pair->num_frames1, 0, feat_dim),
+                         chunk2(feats, pair->start_frame2 + shift2,
+                                pair->num_frames2, 0, feat_dim);
+    NnetIo nnet_io1 = NnetIo("input1", 0, chunk1),
+           nnet_io2 = NnetIo("input2", 0, chunk2);
+    // Set n = 0 on every index of the first chunk and n = 1 on the second.
+    for (size_t k = 0; k < nnet_io1.indexes.size(); k++)
+      nnet_io1.indexes[k].n = 0;
+    for (size_t k = 0; k < nnet_io2.indexes.size(); k++)
+      nnet_io2.indexes[k].n = 1;
+    NnetExample eg;
+    eg.io.push_back(nnet_io1);
+    eg.io.push_back(nnet_io2);
+    if (compress)
+      eg.Compress();
+    (*example_writers)[pair->output_archive_id]->Write(pair->pair_name, eg);
+    (*num_egs_written) += 1;
+  }
+}
+
+// Frees every ChunkPairInfo stored in 'utt_to_pairs' and every writer in
+// 'writers'.  The containers themselves are left holding the stale pointers.
+static void Cleanup(unordered_map<std::string,
+                    std::vector<ChunkPairInfo *> > *utt_to_pairs,
+                    std::vector<NnetExampleWriter *> *writers) {
+  typedef unordered_map<std::string, std::vector<ChunkPairInfo*> > PairMap;
+  for (PairMap::iterator entry = utt_to_pairs->begin();
+      entry != utt_to_pairs->end(); ++entry) {
+    std::vector<ChunkPairInfo*> &chunk_pairs = entry->second;
+    for (size_t p = 0; p < chunk_pairs.size(); p++)
+      delete chunk_pairs[p];
+  }
+  for (size_t w = 0; w < writers->size(); w++)
+    delete (*writers)[w];
+}
+
+} // namespace nnet3
+} // namespace kaldi
+
+// Reads the ranges file and the features, and writes one example per range.
+int main(int argc, char *argv[]) {
+  try {
+    using namespace kaldi;
+    using namespace kaldi::nnet3;
+    typedef kaldi::int32 int32;
+
+    const char *usage =
+        "Get examples for training an nnet3 neural network for the xvector\n"
+        "system.  Each output example contains a pair of feature chunks from\n"
+        "the same utterance.  The location and length of the feature chunks\n"
+        "are specified in the 'ranges' file.  Each line is interpreted as\n"
+        "follows:\n"
+        "  <source-utterance> <output-archive-index>  <start-frame-index1>"
+        " <num-frames1> <start-frame-index2> <num-frames2>\n"
+        "For example:\n"
+        "  utt1  3   0   65  112  110\n"
+        "  utt1  0   160 50  214  180\n"
+        "  utt2  ...\n"
+        "\n"
+        "Usage:  nnet3-xvector-get-egs [options] <ranges-filename> "
+        "<features-rspecifier> <egs-0-out> <egs-1-out> ... <egs-N-1-out>\n"
+        "\n"
+        "For example:\n"
+        "nnet3-xvector-get-egs ranges.1 \"$feats\" ark:egs_temp.1.ark"
+        "  ark:egs_temp.2.ark ark:egs_temp.3.ark\n";
+
+    bool compress = true;
+
+    ParseOptions po(usage);
+    po.Register("compress", &compress, "If true, write egs in "
+                "compressed format.");
+
+    po.Read(argc, argv);
+
+    if (po.NumArgs() < 3) {
+      po.PrintUsage();
+      exit(1);
+    }
+
+    std::string range_rspecifier = po.GetArg(1),
+                feature_rspecifier = po.GetArg(2);
+    std::vector<NnetExampleWriter *> example_writers;
+
+    // Arguments 3..N are the output archives; writer k serves archive index k.
+    for (int32 i = 3; i <= po.NumArgs(); i++)
+      example_writers.push_back(new NnetExampleWriter(po.GetArg(i)));
+
+    unordered_map<std::string, std::vector<ChunkPairInfo *> > utt_to_pairs;
+    ProcessRangeFile(range_rspecifier, &utt_to_pairs);
+    SequentialBaseFloatMatrixReader feat_reader(feature_rspecifier);
+
+    int32 num_done = 0, num_err = 0, num_egs_written = 0;
+
+    for (; !feat_reader.Done(); feat_reader.Next()) {
+      std::string key = feat_reader.Key();
+      const Matrix<BaseFloat> &feats = feat_reader.Value();
+      unordered_map<std::string, std::vector<ChunkPairInfo*> >::iterator
+        got = utt_to_pairs.find(key);
+      if (got == utt_to_pairs.end()) {
+        KALDI_WARN << "Could not create examples from utterance "
+                   << key << " because it has no entry in the ranges "
+                   << "input file.";
+        num_err++;
+      } else {
+        // Pass the stored vector by reference; no per-utterance copy is made.
+        WriteExamples(feats, got->second, key, compress, &num_egs_written,
+                      &example_writers);
+        num_done++;
+      }
+    }
+    Cleanup(&utt_to_pairs, &example_writers);
+
+    KALDI_LOG << "Finished generating examples, "
+              << "successfully processed " << num_done
+              << " feature files, wrote " << num_egs_written << " examples; "
+              << num_err << " files had errors.";
+    // Fail if nothing was written or more utterances failed than succeeded.
+    return (num_egs_written == 0 || num_err > num_done ? 1 : 0);
+  } catch(const std::exception &e) {
+    std::cerr << e.what() << '\n';
+    return -1;
+  }
+}