diff --git a/src/cudamatrix/cu-kernels-ansi.h b/src/cudamatrix/cu-kernels-ansi.h
index a52c42cf347..91d9bd51c4c 100644
--- a/src/cudamatrix/cu-kernels-ansi.h
+++ b/src/cudamatrix/cu-kernels-ansi.h
@@ -5,6 +5,7 @@
 //                2013  Hainan Xu
 //                2013  Xiaohui Zhang
 //           2013-2015  Guoguo Chen
+//                2016  David Snyder
 
 // See ../../COPYING for clarification regarding multiple authors
 //
@@ -179,6 +180,11 @@ void cudaF_equal_element_mask(dim3 Gr, dim3 Bl, const float *mat1,
                               const float *mat2, float *mask, MatrixDim mat1_dim,
                               int mat2_stride, int mask_stride);
 
+void cudaF_compute_xvector_objf(dim3 Gr, dim3 Bl, const float *scores,
+                               MatrixDim scores_dim, float *obfj_terms,
+                               MatrixDim objf_dim, float *objf_derivs,
+                               MatrixDim derivs_dim);
+
 /*********************************************************
  * double CUDA kernel calls
  */
@@ -302,6 +308,10 @@ void cudaD_copy_from_sp(dim3 Gr, dim3 Bl, const double* x, double* y, MatrixDim
 void cudaD_take_lower(dim3 Gr, dim3 Bl, const double* x, double* y, MatrixDim d_in);
 void cudaD_take_upper(dim3 Gr, dim3 Bl, const double* x, double* y, MatrixDim d_in);
 void cudaD_take_mean(dim3 Gr, dim3 Bl, const double* x, double* y, MatrixDim d_in);
+void cudaD_compute_xvector_objf(dim3 Gr, dim3 Bl, const double *scores,
+                               MatrixDim scores_dim, double *obfj_terms,
+                               MatrixDim objf_dim, double *objf_derivs,
+                               MatrixDim derivs_dim);
 
 
 // some mostly mixed-type kernels.
@@ -349,8 +359,6 @@ void cudaD_equal_element_mask(dim3 Gr, dim3 Bl, const double *mat1,
                               const double *mat2, double *mask, MatrixDim mat1_dim,
                               int mat2_stride, int mask_stride);
 
-
-
 } // extern "C"
 
 #endif // HAVE_CUDA
diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu
index d494be4169a..00d4f8bebf8 100644
--- a/src/cudamatrix/cu-kernels.cu
+++ b/src/cudamatrix/cu-kernels.cu
@@ -6,6 +6,7 @@
 //                2013  Hainan Xu
 //                2013  Xiaohui Zhang
 //           2013-2015  Guoguo Chen
+//                2016  David Snyder
 
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -2094,6 +2095,26 @@ static void _diff_xent(const int32_cuda* vec_tgt, Real* mat_net_out, Real* vec_l
   }
 }
 
+template<typename Real>
+__global__
+static void _compute_xvector_objf(const Real* scores, MatrixDim scores_dim,
+                                  Real* obfj_terms, MatrixDim objf_dim, 
+                                  Real* obfj_derivs, MatrixDim derivs_dim) {
+  int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x;
+  int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y;
+  int32_cuda scores_index = i + j * scores_dim.stride;
+  Real K = 1.0 / (scores_dim.rows - 2.0);
+  Real L = scores[scores_index];
+  if (i < scores_dim.cols && j < scores_dim.rows && i < j) {
+    if (i + 1 == j && i % 2 == 0) {
+      obfj_terms[scores_index] = log(1.0 + exp(-L));
+      obfj_derivs[scores_index] = 1.0 / (1.0 + exp(L));
+    } else if (i != j) {
+      obfj_terms[scores_index] = K * log(1.0 + exp(L));
+      obfj_derivs[scores_index] = -K / (1.0 + exp(-L));
+    }
+  }
+}
 
 
 /***********************************************************************
@@ -2575,6 +2596,14 @@ void cudaF_equal_element_mask(dim3 Gr, dim3 Bl, const float *mat1,
   _equal_element_mask<<<Gr,Bl>>>(mat1, mat2, mask, mat1_dim, mat2_stride, mask_stride);
 }
 
+void cudaF_compute_xvector_objf(dim3 Gr, dim3 Bl, const float *scores, 
+                                MatrixDim scores_dim, float *objf_terms, 
+                                MatrixDim objf_dim, float *objf_derivs, 
+                                MatrixDim derivs_dim) {
+  _compute_xvector_objf<<<Gr,Bl>>>(scores, scores_dim, objf_terms, objf_dim,
+    objf_derivs, derivs_dim);
+}
+
 /*
  * "double"
  */
@@ -3029,6 +3058,13 @@ void cudaD_equal_element_mask(dim3 Gr, dim3 Bl, const double *mat1,
   _equal_element_mask<<<Gr,Bl>>>(mat1, mat2, mask, mat1_dim, mat2_stride, mask_stride);
 }
 
+void cudaD_compute_xvector_objf(dim3 Gr, dim3 Bl, const double *scores, 
+                                MatrixDim scores_dim, double *objf_terms, 
+                                MatrixDim objf_dim, double *objf_derivs, 
+                                MatrixDim derivs_dim) {
+  _compute_xvector_objf<<<Gr,Bl>>>(scores, scores_dim, objf_terms, objf_dim,
+    objf_derivs, derivs_dim);
+}
 
 
 /* Some conversion kernels for which it's more convenient to not name them F or D. */
diff --git a/src/cudamatrix/cu-kernels.h b/src/cudamatrix/cu-kernels.h
index 0ded2f794d3..a6e6d2f4961 100644
--- a/src/cudamatrix/cu-kernels.h
+++ b/src/cudamatrix/cu-kernels.h
@@ -289,6 +289,13 @@ inline void cuda_equal_element_mask(dim3 Gr, dim3 Bl, const float *mat1, const f
   cudaF_equal_element_mask(Gr, Bl, mat1, mat2, mask, mat1_dim, mat2_stride, mask_stride);
 }
 
+inline void cuda_compute_xvector_objf(dim3 Gr, dim3 Bl, const float *scores,
+                               MatrixDim scores_dim, float *obfj_terms,
+                               MatrixDim objf_dim, float *objf_derivs,
+                               MatrixDim derivs_dim) {
+  cudaF_compute_xvector_objf(Gr, Bl, scores, scores_dim, obfj_terms, objf_dim,
+                          objf_derivs, derivs_dim);
+}
 
 
 // double versions
@@ -467,6 +474,14 @@ inline void cuda_equal_element_mask(dim3 Gr, dim3 Bl, const double *mat1, const
   cudaD_equal_element_mask(Gr, Bl, mat1, mat2, mask, mat1_dim, mat2_stride, mask_stride);
 }
 
+inline void cuda_compute_xvector_objf(dim3 Gr, dim3 Bl, const double *scores,
+                               MatrixDim scores_dim, double *obfj_terms,
+                               MatrixDim objf_dim, double *objf_derivs,
+                               MatrixDim derivs_dim) {
+  cudaD_compute_xvector_objf(Gr, Bl, scores, scores_dim, obfj_terms, objf_dim,
+                          objf_derivs, derivs_dim);
+}
+
 // Also include some template-friendly wrappers of cublas functions:
 inline cublasStatus_t cuda_axpy(cublasHandle_t handle, int n, float alpha, const float *x, int incx, float *y, int incy) {
   return cublasSaxpy_v2(handle, n, &alpha, x, incx, y, incy);
diff --git a/src/cudamatrix/cu-math.cc b/src/cudamatrix/cu-math.cc
index 97757ba68dd..d5e9cfb6ef8 100644
--- a/src/cudamatrix/cu-math.cc
+++ b/src/cudamatrix/cu-math.cc
@@ -2,6 +2,7 @@
 
 // Copyright 2009-2012  Karel Vesely
 //                      Johns Hopkins University (author: Daniel Povey)
+//           2016       David Snyder
 
 // See ../../COPYING for clarification regarding multiple authors
 //
@@ -29,15 +30,15 @@ namespace kaldi {
 namespace cu {
 
 /*
- * templated functions wrapping the ANSI-C CUDA kernel functions 
+ * templated functions wrapping the ANSI-C CUDA kernel functions
  */
 
 
 template<typename Real>
 void RegularizeL1(CuMatrixBase<Real> *weight, CuMatrixBase<Real> *grad, Real l1, Real lr) {
   KALDI_ASSERT(SameDim(*weight, *grad));
-#if HAVE_CUDA == 1 
-  if (CuDevice::Instantiate().Enabled()) { 
+#if HAVE_CUDA == 1
+  if (CuDevice::Instantiate().Enabled()) {
     Timer tim;
 
     dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
@@ -46,7 +47,7 @@ void RegularizeL1(CuMatrixBase<Real> *weight, CuMatrixBase<Real> *grad, Real l1,
     cuda_regularize_l1(dimGrid, dimBlock, weight->Data(), grad->Data(), l1, lr,
                        weight->Dim(), grad->Stride());
     CU_SAFE_CALL(cudaGetLastError());
-    
+
     CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
   } else
   #endif
@@ -55,11 +56,11 @@ void RegularizeL1(CuMatrixBase<Real> *weight, CuMatrixBase<Real> *grad, Real l1,
     MatrixBase<Real> &grad2 = grad->Mat();
     for(MatrixIndexT r=0; r<weight2.NumRows(); r++) {
       for(MatrixIndexT c=0; c<weight2.NumCols(); c++) {
-        
+
         if(weight2(r,c)==0.0) continue; // skip L1 if zero weightght!
 
         Real l1_signed = l1;
-        if (weight2(r, c) < 0.0) 
+        if (weight2(r, c) < 0.0)
           l1_signed = -l1;
 
         Real before = weight2(r, c);
@@ -88,16 +89,16 @@ void Randomize(const CuMatrixBase<Real> &src,
   #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     Timer tim;
-    
+
     /*
-    Note: default 16x16 block-size limits the --cachesize to matrix size 16*65535 x 16*65535 
+    Note: default 16x16 block-size limits the --cachesize to matrix size 16*65535 x 16*65535
     dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
     dim3 dimGrid(n_blocks(tgt->NumCols(), CU2DBLOCK), n_blocks(copy_from_idx.Dim(), CU2DBLOCK));
     */
 
     /*
      * Let's use blocksize 4 x 128 (512 threads/block)
-     * and extend the randomizable matrices to: col 4*65535, row 128*65535 
+     * and extend the randomizable matrices to: col 4*65535, row 128*65535
      * (ie. max-cols:262140 (dim), max-rows:8388480 (datapoints))
      */
     dim3 dimBlock(4, 128);
@@ -111,7 +112,7 @@ void Randomize(const CuMatrixBase<Real> &src,
     cuda_randomize(dimGrid, dimBlock, tgt->Data(), src.Data(),
                    copy_from_idx.Data(), dimtgt, dimsrc);
     CU_SAFE_CALL(cudaGetLastError());
-    
+
     CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
   } else
   #endif
@@ -124,28 +125,28 @@ void Randomize(const CuMatrixBase<Real> &src,
       tgtmat.Row(i).CopyFromVec(srcmat.Row(copy_from_idxvec[i]));
     }
   }
-} 
+}
 
 
 
 template<typename Real>
 void Splice(const CuMatrixBase<Real> &src, const CuArray<int32> &frame_offsets,
             CuMatrixBase<Real> *tgt) {
-  
+
   KALDI_ASSERT(src.NumCols()*frame_offsets.Dim() == tgt->NumCols());
   KALDI_ASSERT(src.NumRows() == tgt->NumRows());
 
   #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     Timer tim;
-    
+
     dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
     dim3 dimGrid(n_blocks(tgt->NumCols(), CU2DBLOCK), n_blocks(tgt->NumRows(), CU2DBLOCK));
-    
+
     cuda_splice(dimGrid, dimBlock, tgt->Data(), src.Data(),
                 frame_offsets.Data(), tgt->Dim(), src.Dim());
     CU_SAFE_CALL(cudaGetLastError());
-    
+
     CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
   } else
   #endif
@@ -171,7 +172,7 @@ void Splice(const CuMatrixBase<Real> &src, const CuArray<int32> &frame_offsets,
 
 template<typename Real>
 void Copy(const CuMatrixBase<Real> &src, const CuArray<int32> &copy_from_indices,
-          CuMatrixBase<Real> *tgt) { 
+          CuMatrixBase<Real> *tgt) {
 
   KALDI_ASSERT(copy_from_indices.Dim() == tgt->NumCols());
   KALDI_ASSERT(src.NumRows() == tgt->NumRows());
@@ -179,14 +180,14 @@ void Copy(const CuMatrixBase<Real> &src, const CuArray<int32> &copy_from_indices
   #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     Timer tim;
-    
+
     dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
     dim3 dimGrid(n_blocks(tgt->NumCols(), CU2DBLOCK), n_blocks(tgt->NumRows(), CU2DBLOCK));
-    
+
     cuda_copy(dimGrid, dimBlock, tgt->Data(), src.Data(),
               copy_from_indices.Data(), tgt->Dim(), src.Dim());
     CU_SAFE_CALL(cudaGetLastError());
-    
+
     CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
   } else
   #endif
@@ -205,6 +206,31 @@ void Copy(const CuMatrixBase<Real> &src, const CuArray<int32> &copy_from_indices
   }
 }
 
+template<typename Real>
+void ComputeXvectorObjfFromScores(const CuMatrixBase<Real> &scores,
+                                  CuMatrixBase<Real> *objf_terms,
+                                  CuMatrixBase<Real> *objf_derivs) {
+  #if HAVE_CUDA == 1
+  if (CuDevice::Instantiate().Enabled()) {
+    Timer tim;
+    dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
+    dim3 dimGrid(n_blocks(scores.NumCols(), CU2DBLOCK),
+                 n_blocks(scores.NumRows(), CU2DBLOCK));
+
+    cuda_compute_xvector_objf(dimGrid, dimBlock, scores.Data(), scores.Dim(),
+      objf_terms->Data(), objf_terms->Dim(), objf_derivs->Data(),
+      objf_derivs->Dim());
+    CU_SAFE_CALL(cudaGetLastError());
+
+    CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
+  } else
+  #endif
+  {
+    // TODO: Add the CPU version.
+    KALDI_LOG << "NOT USING CUDA";
+  }
+}
+
 // instantiate the templates.
 template
 void RegularizeL1(CuMatrixBase<float> *weight, CuMatrixBase<float> *grad, float l1, float lr);
@@ -233,6 +259,15 @@ void Randomize(const CuMatrixBase<double> &src,
                const CuArray<int32> &copy_from_idx,
                CuMatrixBase<double> *tgt);
 
+template
+void ComputeXvectorObjfFromScores(const CuMatrixBase<float> &scores,
+                                  CuMatrixBase<float> *objf_terms,
+                                  CuMatrixBase<float> *objf_derivs);
+template
+void ComputeXvectorObjfFromScores(const CuMatrixBase<double> &scores,
+                                  CuMatrixBase<double> *objf_terms,
+                                  CuMatrixBase<double> *objf_derivs);
+
 
 
 } //namespace cu
diff --git a/src/cudamatrix/cu-math.h b/src/cudamatrix/cu-math.h
index 65a4c0c4af3..a30cec5d9df 100644
--- a/src/cudamatrix/cu-math.h
+++ b/src/cudamatrix/cu-math.h
@@ -1,7 +1,8 @@
 // cudamatrix/cu-math.h
 
 // Copyright 2009-2012  Karel Vesely
-//                2013  Johns Hopkins University (Author: David Snyder)
+//                2013  Johns Hopkins University (Author: Daniel Povey)
+//                2016  David Snyder
 
 // See ../../COPYING for clarification regarding multiple authors
 //
@@ -78,7 +79,13 @@ void Group2norm(const CuMatrixBase<Real> &src,
                 CuMatrixBase<Real> *dest,
                 int32 group_stride);
 
-
+/*
+TODO: Documentation.
+*/
+template <typename BaseFloat>
+void ComputeXvectorObjfFromScores(const CuMatrixBase<BaseFloat> &scores,
+                                  CuMatrixBase<BaseFloat> *objf_terms,
+                                  CuMatrixBase<BaseFloat> *objf_derivs);
 
 
 } // namespace cu
diff --git a/src/ivector/Makefile b/src/ivector/Makefile
index 918bd4ef113..7bd8c1ce351 100644
--- a/src/ivector/Makefile
+++ b/src/ivector/Makefile
@@ -5,14 +5,18 @@ OPENFST_CXXFLAGS =
 OPENFST_LDLIBS = 
 include ../kaldi.mk
 
-TESTFILES = ivector-extractor-test plda-test logistic-regression-test
+LDFLAGS += $(CUDA_LDFLAGS)
+LDLIBS += $(CUDA_LDLIBS)
 
-OBJFILES = ivector-extractor.o voice-activity-detection.o plda.o logistic-regression.o
+TESTFILES = ivector-extractor-test plda-test logistic-regression-test xvector-test
+
+OBJFILES = ivector-extractor.o voice-activity-detection.o plda.o logistic-regression.o xvector.o
 
 LIBNAME = kaldi-ivector
 
 ADDLIBS = ../gmm/kaldi-gmm.a ../tree/kaldi-tree.a ../transform/kaldi-transform.a \
-		../thread/kaldi-thread.a ../matrix/kaldi-matrix.a ../base/kaldi-base.a \
-        ../util/kaldi-util.a 
+          ../thread/kaldi-thread.a ../cudamatrix/kaldi-cudamatrix.a \
+          ../matrix/kaldi-matrix.a ../base/kaldi-base.a \
+          ../util/kaldi-util.a
 
 include ../makefiles/default_rules.mk
diff --git a/src/ivector/xvector-test.cc b/src/ivector/xvector-test.cc
new file mode 100644
index 00000000000..229863e820a
--- /dev/null
+++ b/src/ivector/xvector-test.cc
@@ -0,0 +1,299 @@
+// ivector/xvector-test.cc
+
+// Copyright 2016  David Snyder
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include "ivector/xvector.h"
+#include "util/kaldi-io.h"
+#include "cudamatrix/cu-matrix-lib.h"
+
+namespace kaldi {
+BaseFloat TestSimilarityScore(const CuVector<BaseFloat> &v,
+  const CuVector<BaseFloat> &w, const CuSpMatrix<BaseFloat> &S,
+  BaseFloat b);
+
+void TestGetDeriv(const CuVector<BaseFloat> &v,
+    const CuVector<BaseFloat> &w, const CuSpMatrix<BaseFloat> &S,
+    BaseFloat b, bool is_same, BaseFloat similarity_score,
+    CuVector<BaseFloat> *deriv_v, CuVector<BaseFloat> *deriv_w,
+    CuVector<BaseFloat> *deriv_S_and_b);
+
+void TestComputeXvectorObjfAndDeriv(
+    const CuMatrixBase<BaseFloat> &xvector_pairs,
+    const CuSpMatrix<BaseFloat> &S,
+    BaseFloat b, CuMatrixBase<BaseFloat> *deriv_xvector,
+    CuVector<BaseFloat> *deriv_S_and_b, BaseFloat *tot_objf,
+    BaseFloat *tot_weight);
+
+bool TestXvectorExtractorDerivative(BaseFloat perturb_delta) {
+  int32 xvector_dim = RandInt(4, 30),
+        num_rows = 2 * RandInt(2, 10); // The number of rows must be even
+                                       // and greater than 2.
+  CuSpMatrix<BaseFloat> S(xvector_dim);
+  S.SetRandn();
+  // Necessary to keep the similarity scores from getting too large or small.
+  S.Scale(1.0e-01);
+  BaseFloat b = RandInt(-100, 100) / 10.0,
+            tot_weight,
+            tot_objf;
+  int32 S_dim = S.NumCols() * (S.NumCols() + 1) / 2;
+  CuMatrix<BaseFloat> xvector_pairs(num_rows, xvector_dim, kSetZero),
+                      deriv_xvector(num_rows, xvector_dim, kSetZero);
+  CuVector<BaseFloat> deriv_S_and_b(S_dim + 1, kSetZero);
+  xvector_pairs.SetRandn();
+  ComputeXvectorObjfAndDeriv(xvector_pairs, S, b, &deriv_xvector,
+    &deriv_S_and_b, &tot_objf, &tot_weight);
+  CuVector<BaseFloat> deriv_xvector_vec(xvector_dim);
+
+  // Sum over the derivatives for xvector input.
+  deriv_xvector_vec.AddRowSumMat(1.0, deriv_xvector, 0.0);
+  BaseFloat l2_xvector = 0,
+            l2_S = 0,
+            l2_b = 0;
+
+  // Compare the xvector derivatives calculated above with a numerical
+  // approximation.
+  for (int32 i = 0; i < xvector_dim; i++) {
+    CuMatrix<BaseFloat> xvector_pairs_p(xvector_pairs);
+    CuMatrix<BaseFloat> xvector_pairs_n(xvector_pairs);
+    for (int32 j = 0; j < num_rows; j++) {
+      xvector_pairs_p(j, i) += perturb_delta;
+      xvector_pairs_n(j, i) += -perturb_delta;
+    }
+    CuMatrix<BaseFloat> deriv_xvector_tmp(num_rows, xvector_dim, kSetZero);
+    CuVector<BaseFloat> deriv_S_and_b_tmp(S_dim + 1, kSetZero);
+    BaseFloat tot_objf_p;
+    BaseFloat tot_objf_n;
+    ComputeXvectorObjfAndDeriv(xvector_pairs_p, S, b, &deriv_xvector_tmp,
+      &deriv_S_and_b_tmp, &tot_objf_p, &tot_weight);
+    ComputeXvectorObjfAndDeriv(xvector_pairs_n, S, b, &deriv_xvector_tmp,
+      &deriv_S_and_b_tmp, &tot_objf_n, &tot_weight);
+    BaseFloat delta = (tot_objf_p  - tot_objf_n)
+      * 1.0 / (2.0 * perturb_delta);
+    l2_xvector += pow(deriv_xvector_vec(i) - delta, 2);
+  }
+
+  // Compare the S derivative calculated above with a numerical
+  // approximation.
+  for (int32 i = 0; i < S_dim; i++) {
+    CuSpMatrix<BaseFloat> S_p(S);
+    CuSpMatrix<BaseFloat> S_n(S);
+    S_p.Data()[i] += perturb_delta;
+    S_n.Data()[i] -= perturb_delta;
+    CuMatrix<BaseFloat> deriv_xvector_tmp(num_rows, xvector_dim, kSetZero);
+    CuVector<BaseFloat> deriv_S_and_b_tmp(S_dim + 1, kSetZero);
+    BaseFloat tot_objf_p;
+    BaseFloat tot_objf_n;
+    ComputeXvectorObjfAndDeriv(xvector_pairs, S_p, b, &deriv_xvector_tmp,
+      &deriv_S_and_b_tmp, &tot_objf_p, &tot_weight);
+    ComputeXvectorObjfAndDeriv(xvector_pairs, S_n, b, &deriv_xvector_tmp,
+      &deriv_S_and_b_tmp, &tot_objf_n, &tot_weight);
+    BaseFloat delta = (tot_objf_p  - tot_objf_n)
+      * 1.0 / (2.0 * perturb_delta);
+    l2_S += pow(deriv_S_and_b(i) - delta, 2);
+  }
+
+  // Compare the b derivative calculated above with a numerical
+  // approximation.
+  BaseFloat b_p = b + perturb_delta;
+  BaseFloat b_n = b - perturb_delta;
+  CuMatrix<BaseFloat> deriv_xvector_tmp(num_rows, xvector_dim, kSetZero);
+  CuVector<BaseFloat> deriv_S_and_b_tmp(S_dim + 1, kSetZero);
+  BaseFloat tot_objf_p;
+  BaseFloat tot_objf_n;
+  ComputeXvectorObjfAndDeriv(xvector_pairs, S, b_p, &deriv_xvector_tmp,
+    &deriv_S_and_b_tmp, &tot_objf_p, &tot_weight);
+  ComputeXvectorObjfAndDeriv(xvector_pairs, S, b_n, &deriv_xvector_tmp,
+    &deriv_S_and_b_tmp, &tot_objf_n, &tot_weight);
+  BaseFloat delta = (tot_objf_p  - tot_objf_n) * 1.0 / (2.0 * perturb_delta);
+  l2_b = pow(deriv_S_and_b(S_dim) - delta, 2);
+  KALDI_ASSERT(l2_xvector < 1.0e-03);
+  KALDI_ASSERT(l2_S <  1.0e-03);
+  KALDI_ASSERT(l2_b < 1.0e-03);
+  return true;
+}
+
+bool TestXvectorComputeObjf() {
+  int32 xvector_dim = RandInt(4, 30),
+        num_rows = 2 * RandInt(2, 10); // The number of rows must be even
+                                       // and greater than 2.
+  CuSpMatrix<BaseFloat> S(xvector_dim);
+  S.SetRandn();
+  // Necessary to keep the similarity scores from getting too large or small.
+  S.Scale(1.0e-01);
+  BaseFloat b = RandInt(-200, 200) / 10.0,
+            tot_weight,
+            tot_weight_test,
+            tot_objf,
+            tot_objf_test;
+  int32 S_dim = S.NumCols() * (S.NumCols() + 1) / 2;
+  CuMatrix<BaseFloat> xvector_pairs(num_rows, xvector_dim, kSetZero),
+                      deriv_xvector(num_rows, xvector_dim, kSetZero),
+                      deriv_xvector_test(num_rows, xvector_dim, kSetZero);
+  CuVector<BaseFloat> deriv_S_and_b(S_dim + 1, kSetZero),
+                      deriv_S_and_b_test(S_dim + 1, kSetZero);
+  xvector_pairs.SetRandn();
+
+  ComputeXvectorObjfAndDeriv(xvector_pairs, S, b, &deriv_xvector,
+    &deriv_S_and_b, &tot_objf, &tot_weight);
+  TestComputeXvectorObjfAndDeriv(xvector_pairs, S, b, &deriv_xvector_test,
+    &deriv_S_and_b_test, &tot_objf_test, &tot_weight_test);
+
+  CuVector<BaseFloat> deriv_xvector_vec(xvector_dim);
+  deriv_xvector_vec.AddRowSumMat(1.0, deriv_xvector, 0.0);
+  CuVector<BaseFloat> deriv_xvector_vec_test(xvector_dim);
+  deriv_xvector_vec_test.AddRowSumMat(1.0, deriv_xvector_test, 0.0);
+
+  // Verify that the objfs are the same.
+  KALDI_ASSERT(ApproxEqual(tot_objf, tot_objf_test, 0.001));
+  // Also verify that the gradients are the same.
+  for (int32 i = 0; i < deriv_xvector_vec.Dim(); i++)
+    KALDI_ASSERT(ApproxEqual(deriv_xvector_vec(i), deriv_xvector_vec_test(i), 0.001));
+  for (int32 i = 0; i < deriv_S_and_b.Dim(); i++)
+    KALDI_ASSERT(ApproxEqual(deriv_S_and_b(i), deriv_S_and_b_test(i), 0.001));
+  return true;
+}
+
+void TestComputeXvectorObjfAndDeriv(
+    const CuMatrixBase<BaseFloat> &xvector_pairs,
+    const CuSpMatrix<BaseFloat> &S,
+    BaseFloat b, CuMatrixBase<BaseFloat> *deriv_xvector,
+    CuVector<BaseFloat> *deriv_S_and_b, BaseFloat *tot_objf,
+    BaseFloat *tot_weight) {
+
+  int32 N = xvector_pairs.NumRows();
+  BaseFloat same_objf = 0,
+            diff_objf = 0;
+  BaseFloat K = 1.0 / (N - 2.0);
+  int32 S_dim = S.NumCols() * (S.NumCols() + 1) / 2;
+  CuMatrix<BaseFloat> tmp_deriv(N, xvector_pairs.NumCols()
+                                + S_dim + 1, kSetZero);
+  // Handle portion of the objf corresponding to pairs of xvectors
+  // from the same classes.
+  for (int32 i = 0; i < N/2; i++) {
+    const CuVector<BaseFloat> &v(xvector_pairs.Row(2 * i)),
+                              &w(xvector_pairs.Row(2 * i + 1));
+    CuVector<BaseFloat> deriv_v,
+                        deriv_w,
+                        deriv_S_and_b_part;
+    BaseFloat similarity_score = TestSimilarityScore(v, w, S, b);
+    same_objf += Log(1 + Exp(-similarity_score));
+    TestGetDeriv(v, w, S, b, true, similarity_score, &deriv_v,
+     &deriv_w, &deriv_S_and_b_part);
+    deriv_xvector->Row(2 * i).AddVec(1.0, deriv_v);
+    deriv_xvector->Row(2 * i + 1).AddVec(1.0, deriv_w);
+    deriv_S_and_b->AddVec(1.0, deriv_S_and_b_part);
+  }
+
+  // Handle portion of the objf corresponding to pairs of xvectors
+  // from different classes.
+  for (int32 i = 0; i < N; i++) {
+    for (int32 j = 2 * std::ceil((i + 1) / 2.0); j < N; j++) {
+      const CuVector<BaseFloat> &v(xvector_pairs.Row(i)),
+                                &w(xvector_pairs.Row(j));
+      CuVector<BaseFloat> deriv_v,
+                          deriv_w,
+                          deriv_S_and_b_part;
+      BaseFloat similarity_score = TestSimilarityScore(v, w, S, b);
+      diff_objf += Log(1 + Exp(similarity_score));
+      TestGetDeriv(v, w, S, b, false, similarity_score, &deriv_v,
+        &deriv_w, &deriv_S_and_b_part);
+      deriv_xvector->Row(i).AddVec(K, deriv_v);
+      deriv_xvector->Row(j).AddVec(K, deriv_w);
+      deriv_S_and_b->AddVec(K, deriv_S_and_b_part);
+    }
+  }
+  // Scale the same and different portions of the objective function
+  // so that both contribute a weight of N.
+  (*tot_objf) = same_objf + K * diff_objf;
+  (*tot_weight) = N;
+}
+
+
+void TestGetDeriv(const CuVector<BaseFloat> &v,
+    const CuVector<BaseFloat> &w, const CuSpMatrix<BaseFloat> &S,
+    BaseFloat b, bool is_same, BaseFloat similarity_score,
+    CuVector<BaseFloat> *deriv_v, CuVector<BaseFloat> *deriv_w,
+    CuVector<BaseFloat> *deriv_S_and_b) {
+  int32 d = is_same ? 1 : -1,
+        S_dim = S.NumCols() * (S.NumCols() + 1) / 2;
+  deriv_v->Resize(v.Dim(), kSetZero);
+  deriv_w->Resize(v.Dim(), kSetZero);
+  deriv_S_and_b->Resize(S_dim + 1, kSetZero);
+
+  // This scalar is common to the different derivatives.
+  BaseFloat deriv_coef = d * Exp(-1 * d * similarity_score)
+    / (1 + Exp(-1 * d * similarity_score));
+
+  // Handle derivative with respect to v and w.
+  deriv_v->CopyFromVec(w);
+  deriv_w->CopyFromVec(v);
+  deriv_v->AddSpVec(2.0, S, v, -1.0);
+  deriv_w->AddSpVec(2.0, S, w, -1.0);
+  deriv_v->Scale(deriv_coef);
+  deriv_w->Scale(deriv_coef);
+
+  // Handle derivative with respect to S.
+  CuSpMatrix<BaseFloat> deriv_S_mat(S.NumCols(), kSetZero);
+  deriv_S_mat.AddVec2(2.0, v);
+  deriv_S_mat.AddVec2(2.0, w);
+  for (int32 i = 0; i < S.NumCols(); i++)
+    deriv_S_mat(i, i) = 0.5 * deriv_S_mat(i, i);
+  CuSubVector<BaseFloat> deriv_S_vec(deriv_S_mat.Data(), S_dim);
+  CuSubVector<BaseFloat> sub_deriv_S_and_b(*deriv_S_and_b, 0, S_dim);
+  sub_deriv_S_and_b.AddVec(deriv_coef, deriv_S_vec);
+
+  // Handle derivative with respect to b.
+  (*deriv_S_and_b)(S_dim) = -deriv_coef;
+}
+
+BaseFloat TestSimilarityScore(const CuVector<BaseFloat> &v,
+  const CuVector<BaseFloat> &w, const CuSpMatrix<BaseFloat> &S,
+  BaseFloat b) {
+  CuVector<BaseFloat> Sv(v.Dim());
+  Sv.AddSpVec(1.0, S, v, 0);
+  CuVector<BaseFloat> Sw(w.Dim());
+  Sw.AddSpVec(1.0, S, w, 0);
+  BaseFloat L = VecVec(v, w) - VecVec(v, Sv) - VecVec(w, Sw) + b;
+  return L;
+}
+
+void UnitTestXvectorExtractor() {
+  if (!TestXvectorComputeObjf())
+    KALDI_ERR << "Xvector objf test failed";
+  if (!TestXvectorExtractorDerivative(1.0e-02) &&
+     !TestXvectorExtractorDerivative(1.0e-03) &&
+     !TestXvectorExtractorDerivative(1.0e-04) &&
+     !TestXvectorExtractorDerivative(1.0e-05))
+    KALDI_ERR << "Xvector derivative test failed";
+}
+
+} // namespace kaldi
+
+int main() {
+  using namespace kaldi;
+  for (int32 i = 0; i < 2; i++)
+#if HAVE_CUDA == 1
+    if (i == 0)
+      CuDevice::Instantiate().SelectGpuId("no"); // -1 means no GPU
+    else
+      CuDevice::Instantiate().SelectGpuId("yes"); // -1 means no GPU
+#endif
+    UnitTestXvectorExtractor();
+  std::cout << "Xvector tests succeeded.\n";
+  return 0;
+}
diff --git a/src/ivector/xvector.cc b/src/ivector/xvector.cc
new file mode 100644
index 00000000000..a6e8533b611
--- /dev/null
+++ b/src/ivector/xvector.cc
@@ -0,0 +1,78 @@
+// ivector/xvector.cc
+
+// Copyright 2016  David Snyder
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include "ivector/xvector.h"
+
+namespace kaldi {
+
+void ComputeXvectorObjfAndDeriv(
+    const CuMatrixBase<BaseFloat> &xvector_pairs,
+    const CuSpMatrix<BaseFloat> &S,
+    BaseFloat b, CuMatrixBase<BaseFloat> *deriv_xvector,
+    CuVector<BaseFloat> *deriv_S_and_b, BaseFloat *tot_objf,
+    BaseFloat *tot_weight) {
+
+  int32 S_dim = S.NumCols() * (S.NumCols() + 1) / 2,
+        N = xvector_pairs.NumRows(),
+        xvector_dim = xvector_pairs.NumCols();
+  BaseFloat K = 1.0 / (N - 2.0);
+  (*tot_objf) = 0;
+
+  if (deriv_xvector == NULL)
+    KALDI_ASSERT(deriv_S_and_b == NULL);
+  else {
+    KALDI_ASSERT(deriv_xvector->NumCols() == xvector_dim);
+    KALDI_ASSERT(deriv_xvector->NumRows() == N);
+    KALDI_ASSERT(deriv_S_and_b->Dim() == S_dim + 1);
+  }
+
+  CuMatrix<BaseFloat> S_tmp(S);
+  CuMatrix<BaseFloat> P(N, xvector_dim),
+                      Q(N, N),
+                      R(N, N),
+                      T(N, N),
+                      objf_terms(N, N),
+                      objf_deriv_terms(N, N);
+
+  CuVector<BaseFloat> r(N);
+  P.AddMatMat(1.0, xvector_pairs, kNoTrans, S_tmp, kNoTrans, 0.0);
+  r.AddDiagMatMat(1.0, xvector_pairs, kNoTrans, P, kTrans, 0.0);
+  R.AddVecToRows(1.0, r);
+  Q.SymAddMat2(1.0, xvector_pairs, kNoTrans, 0.0);
+  Q.CopyLowerToUpper();
+  T.AddMat(1.0, Q, kNoTrans);
+  T.AddMat(-1.0, R, kTrans);
+  T.AddMat(-1.0, R, kNoTrans);
+  T.Add(b);
+
+  cu::ComputeXvectorObjfFromScores<BaseFloat>(T, &objf_terms, &objf_deriv_terms);
+  CuVector<BaseFloat> objf_terms_vec(N);
+  objf_terms_vec.AddRowSumMat(1.0, objf_terms);
+  (*tot_objf) = objf_terms_vec.Sum();
+
+  if (deriv_xvector != NULL) {
+    /* TODO: Call cu-math function that handles the derivatives of S
+       and the xvectors.
+    */
+    (*deriv_S_and_b)(S_dim) = -objf_deriv_terms.Sum();
+  }
+  (*tot_weight) = N;
+}
+
+} // namespace kaldi
diff --git a/src/ivector/xvector.h b/src/ivector/xvector.h
new file mode 100644
index 00000000000..ddb05c632d7
--- /dev/null
+++ b/src/ivector/xvector.h
@@ -0,0 +1,74 @@
+// ivector/xvector.h
+
+// Copyright 2016    Johns Hopkins University (Author: Daniel Povey)
+//           2016    David Snyder
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_IVECTOR_XVECTOR_H_
+#define KALDI_IVECTOR_XVECTOR_H_
+
+#include <vector>
+#include "base/kaldi-common.h"
+#include "cudamatrix/cu-matrix-lib.h"
+#include "itf/options-itf.h"
+#include "util/common-utils.h"
+#include "matrix/matrix-lib.h"
+
+namespace kaldi {
+  /*
+  Computes the training objective function and the derivatives for
+  the xvector.  Let N = xvector_pairs.NumRows() be the number of
+  xvectors. There are N(N-1)/2 pairs in total and N from the same
+  class. Let v(n) be the n'th row of the matrix xvector_pairs.
+  The total objective function written to 'tot_objf' is
+      \sum_{n=0}^{N/2} p_same(v(n*2), v(n*2+1))
+      + 1/(N-2) \sum_{n=0}^{N} \sum_{m=2*ceil(n+1)/2)}^{N}
+      p_different(v(m), v(n))
+  and let N be the normalizer for the objective function, written to
+  'tot_weight' and equal to the total (weighted) number of samples over
+  which the objective function is computed. It is useful for displaying
+  the objective function correctly.
+  Let the log-odds L(v,w) [interpreted as log(p_same(v,w) / p_different(v,w))]
+  be defined as:
+      L(v, w) = v' w -  v' S v - w' S w
+  then p_same(v, w) = -log(1 + exp(-l(v, w)), and
+  p_different(v, w) = 1 - p_same(v, w) = -log(1 + exp(-l(v, w)).
+
+  @param [in] xvector_pairs   Each row of 'xvector_pairs' is an xvector
+  extracted by the network for one sample, and the assumption is that
+  pairs of the form (2*k, 2*k+1), e.g., (0, 1), (2, 3), (4, 5), etc,
+  are from the same class, but any other pairs, e.g., (0, 2), (1, 2),
+  (2, 4), etc, are from different classes.
+  @param [out] deriv_xvector  If non-NULL, the derivative of the objective
+  function with respect to the xvectors is written here.
+  @param [out] deriv_S_and_b  If non-NULL, the derivative of the objective
+  function with respect to the parameters S and b are written here.
+  @param [out] tot_objf  The total objective function described above
+  @param [out] tot_weight  The total normalizing factor for the objective
+  function, equal to dvector_pairs.NumRows().
+  */
+  void ComputeXvectorObjfAndDeriv(const CuMatrixBase<BaseFloat> &dvector_pairs,
+    const CuSpMatrix<BaseFloat> &S,
+    BaseFloat b,
+    CuMatrixBase<BaseFloat> *deriv_xvector,
+    CuVector<BaseFloat> *deriv_S_and_b,
+    BaseFloat *tot_objf,
+    BaseFloat *tot_weight);
+
+}  // namespace kaldi
+
+#endif