12 changes: 10 additions & 2 deletions src/cudamatrix/cu-kernels-ansi.h
@@ -5,6 +5,7 @@
// 2013 Hainan Xu
// 2013 Xiaohui Zhang
// 2013-2015 Guoguo Chen
// 2016 David Snyder

// See ../../COPYING for clarification regarding multiple authors
//
@@ -179,6 +180,11 @@ void cudaF_equal_element_mask(dim3 Gr, dim3 Bl, const float *mat1,
const float *mat2, float *mask, MatrixDim mat1_dim,
int mat2_stride, int mask_stride);

void cudaF_compute_xvector_objf(dim3 Gr, dim3 Bl, const float *scores,
MatrixDim scores_dim, float *objf_terms,
MatrixDim objf_dim, float *objf_derivs,
MatrixDim derivs_dim);

/*********************************************************
* double CUDA kernel calls
*/
@@ -302,6 +308,10 @@ void cudaD_copy_from_sp(dim3 Gr, dim3 Bl, const double* x, double* y, MatrixDim
void cudaD_take_lower(dim3 Gr, dim3 Bl, const double* x, double* y, MatrixDim d_in);
void cudaD_take_upper(dim3 Gr, dim3 Bl, const double* x, double* y, MatrixDim d_in);
void cudaD_take_mean(dim3 Gr, dim3 Bl, const double* x, double* y, MatrixDim d_in);
void cudaD_compute_xvector_objf(dim3 Gr, dim3 Bl, const double *scores,
MatrixDim scores_dim, double *objf_terms,
MatrixDim objf_dim, double *objf_derivs,
MatrixDim derivs_dim);


// some mostly mixed-type kernels.
@@ -349,8 +359,6 @@ void cudaD_equal_element_mask(dim3 Gr, dim3 Bl, const double *mat1,
const double *mat2, double *mask, MatrixDim mat1_dim,
int mat2_stride, int mask_stride);



} // extern "C"

#endif // HAVE_CUDA
36 changes: 36 additions & 0 deletions src/cudamatrix/cu-kernels.cu
@@ -6,6 +6,7 @@
// 2013 Hainan Xu
// 2013 Xiaohui Zhang
// 2013-2015 Guoguo Chen
// 2016 David Snyder

// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -2094,6 +2095,26 @@ static void _diff_xent(const int32_cuda* vec_tgt, Real* mat_net_out, Real* vec_l
}
}

template<typename Real>
__global__
static void _compute_xvector_objf(const Real* scores, MatrixDim scores_dim,
Real* objf_terms, MatrixDim objf_dim,
Real* objf_derivs, MatrixDim derivs_dim) {
int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x;
int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y;
int32_cuda scores_index = i + j * scores_dim.stride;
Real K = 1.0 / (scores_dim.rows - 2.0);
Real L = scores[scores_index];
if (i < scores_dim.cols && j < scores_dim.rows && i < j) {
Owner:
To avoid separately having to zero the upper triangle and the diagonal of the matrix, you might as well do it in this kernel. [i.e. and set it to kUndefined before calling this kernel].
However, I suppose this all becomes moot if you end up using Pegah's idea and rely on the SoftHinge kernel and a fixed scaling matrix.
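A minimal sketch of that suggestion against the kernel above (the else branch is an addition for illustration, not part of the PR): threads that land on the diagonal or the unused triangle write zeros, so the output matrices could be allocated with kUndefined.

if (i < scores_dim.cols && j < scores_dim.rows) {
  if (i < j) {
    // ... the existing same-class / different-class assignments go here ...
  } else {
    // Diagonal and the unused triangle: write zeros so the caller does not
    // have to zero objf_terms and objf_derivs separately.
    objf_terms[scores_index] = 0.0;
    objf_derivs[scores_index] = 0.0;
  }
}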

Author:
After looking at it more, I think it's better to just do this in a cuda kernel.

Also, I still need to make kernels for the actual derivatives, which are somewhat nontrivial to compute in an efficient way... I don't think it's possible to use Pegah's idea to handle them.

Owner:
I think the only not-100%-trivial thing about the derivatives is the fact that different parts of the matrix have different scaling factors. You could probably compute the objf and derivs as follows using individual kernels.

  • get matrix of scores.
  • apply fixed-scaling-1 to matrix of scores (to negate different-class)
  • compute soft-hinge function
  • Compute TraceMatMat of this matrix with a fixed scaling matrix fixed-scaling-2 (with 1/(num-rows-2) for different-class members) to get the objf
  • use the Sigmoid function to compute the derivative of the soft-hinge nonlinearity
  • Multiply the derivatives by fixed-scaling-1 * fixed-scaling-2. These are the derivatives of the objective function w.r.t. the raw scores.

There may be a few signs wrong here. However, it would be more efficient to do all of the above in a single kernel. You can easily do it in the same kernel that computes the objective-function terms. [do summation via matrix-sum though].

Dan
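A rough sketch of that per-kernel recipe using existing CuMatrix operations (scores, fixed_scaling_1, fixed_scaling_2 and N are placeholder names, not code from this PR, and the signs carry the same caveat as above):

// Sketch only: objf and d(objf)/d(scores) from an N x N matrix of raw scores.
CuMatrix<BaseFloat> scaled(scores);                  // copy of the raw scores
scaled.MulElements(fixed_scaling_1);                 // negate the different-class entries
CuMatrix<BaseFloat> hinge(N, N);
hinge.SoftHinge(scaled);                             // elementwise log(1 + exp(x))
BaseFloat objf = TraceMatMat(hinge, fixed_scaling_2, kTrans);  // weighted sum of the terms
CuMatrix<BaseFloat> scores_deriv(N, N);
scores_deriv.Sigmoid(scaled);                        // derivative of the soft-hinge
scores_deriv.MulElements(fixed_scaling_1);           // chain rule through the first scaling
scores_deriv.MulElements(fixed_scaling_2);           // per-entry weights from the objf
// scores_deriv now holds d(objf)/d(scores).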


Author:
I think you're describing an alternative way to get the coefficients for the derivative terms. But, the kernel code above already does that.

On the CPU, the derivative w.r.t. S needs something like the following (NOTE: I'm ignoring peculiarities due to S being symmetric):

for i=0 ... N:
  for j = 0 ... N:
     v = xvectors(i)
     w = xvectors(j)
     deriv_S += C(i,j) * (v v' + w w')

Where C() is a coefficient dependent on whether or not the vectors at row i and j are from the same or different classes. This is what we calculated in the kernel above.

Each v,w pair results in its own matrix. I think this makes it harder to deal with in a single kernel. I think the easiest thing to do is to create an additional kernel that works like a modified form of matrix multiplication. Suppose V is the matrix of xvectors and D = NumCols(V). Then P = V' "times" V is the serialized outer product of each row of V. For example, P.Row(0) = Serialized( V.Row(0) * V.Row(0)'). In other words, p_{i,j} = v_{i, (j / D) % D} * v_{i, j % D}.

Once that is done, it should be more straightforward to calculate S_deriv += C(i, j) * (P.Row(i) + P.Row(j)) in parallel.
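A rough sketch of that proposed kernel, following the indexing conventions of the kernels in this PR (the function name and argument layout are assumptions):

// Each row of P is the serialized outer product of the same row of V, i.e.
// P(i, j) = V(i, j / D) * V(i, j % D), where D = NumCols(V) and P has D*D columns.
template<typename Real>
__global__
static void _serialized_outer_products(const Real* V, MatrixDim v_dim,
                                       Real* P, MatrixDim p_dim) {
  int32_cuda j = blockIdx.x * blockDim.x + threadIdx.x;  // column index into P
  int32_cuda i = blockIdx.y * blockDim.y + threadIdx.y;  // row index into P and V
  int32_cuda D = v_dim.cols;
  if (j < p_dim.cols && i < p_dim.rows) {
    P[i * p_dim.stride + j] =
        V[i * v_dim.stride + j / D] * V[i * v_dim.stride + j % D];
  }
}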

Owner:
I don't think you are really thinking about this in the spirit of backprop. The general principle is that you go forward computing the objective function, and then you do a process that is roughly the mirror-image of the forward process to backprop the derivatives through the computation.

What I described was getting the derivatives of the objective function w.r.t. the matrix of scores. After that you just have to do the reverse of the forward operations to get the derivatives w.r.t. S and the matrix of xvectors.

Dan


Author:
After that you just have to do the reverse of the forward operations to get the derivatives w.r.t. S and the matrix of xvectors.

Right, that's what I'm referring to. Once you have the derivs of the objf w.r.t. the scores (included in C(i,j)), you still need to compute the derivative of the scores w.r.t. S. However, as far as I can tell, unless you try to do that in a kernel, you'll end up with an algorithm with two loops over the xvectors (see pseudo-code in earlier post). I proposed the kernel above to parallelize that computation.

Owner:
OK, let me work this out...
The forward computation is something like:

A = X X'
cvec = diag(X S X')
u = vector of ones
scores = A - cvec u' - u cvec' + b
... compute the objf and get scores_deriv, which is d(objf)/d(scores)
A_deriv = scores_deriv
X_deriv += 2 A_deriv X (or something like that)
cvec_deriv = - sum of scores_deriv columns - sum of scores_deriv rows

When computing the deriv w.r.t. S I am thinking about the expression cvec_deriv . cvec, which equals trace(diag(cvec_deriv) X S X'), where diag(cvec_deriv) is a matrix whose diagonal is cvec_deriv, which we can rearrange to trace(S (X' diag(cvec_deriv) X)). We get from this (through a mysterious process, I do it intuitively),

S_deriv = X' diag(cvec_deriv) X

which is pretty easy to compute.
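A short sketch of those backprop steps with existing CuMatrix/CuVector calls (X, scores_deriv, N and D are placeholder names for the N x D xvector matrix and the N x N derivative of the objf w.r.t. the scores; the extra contribution to X_deriv through cvec is omitted):

// cvec_deriv = -(sum of scores_deriv over rows) - (sum over columns)
CuVector<BaseFloat> cvec_deriv(N);
cvec_deriv.AddRowSumMat(-1.0, scores_deriv, 0.0);
cvec_deriv.AddColSumMat(-1.0, scores_deriv, 1.0);
// A_deriv = scores_deriv, and X_deriv += 2 A_deriv X (as above).
CuMatrix<BaseFloat> X_deriv(N, D);
X_deriv.AddMatMat(2.0, scores_deriv, kNoTrans, X, kNoTrans, 0.0);
// S_deriv = X' diag(cvec_deriv) X
CuMatrix<BaseFloat> tmp(N, D);
tmp.AddDiagVecMat(1.0, cvec_deriv, X, kNoTrans, 0.0);
CuMatrix<BaseFloat> S_deriv(D, D);
S_deriv.AddMatMat(1.0, X, kTrans, tmp, kNoTrans, 0.0);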


Author:
OK, I'll play with it some more to see if I can get it to work without a kernel and without an O(N^2) computation.

In your procedure, it isn't obvious to me (yet) that you can get terms of the form S_deriv = C(x,y) * (x x' + y y') for all combinations of (x,y) pairs. That's where the O(N^2) comes from that I'm trying to avoid.

Owner:
The fact that it was possible in the forward computation generally means it's possible in the backward computation. You'll get S_deriv = X' diag(cvec_deriv) X, I think.


if (i + 1 == j && i % 2 == 0) {
objf_terms[scores_index] = log(1.0 + exp(-L));
objf_derivs[scores_index] = 1.0 / (1.0 + exp(L));
} else if (i != j) {
objf_terms[scores_index] = K * log(1.0 + exp(L));
objf_derivs[scores_index] = -K / (1.0 + exp(-L));
}
}
}


/***********************************************************************
@@ -2575,6 +2596,14 @@ void cudaF_equal_element_mask(dim3 Gr, dim3 Bl, const float *mat1,
_equal_element_mask<<<Gr,Bl>>>(mat1, mat2, mask, mat1_dim, mat2_stride, mask_stride);
}

void cudaF_compute_xvector_objf(dim3 Gr, dim3 Bl, const float *scores,
MatrixDim scores_dim, float *objf_terms,
MatrixDim objf_dim, float *objf_derivs,
MatrixDim derivs_dim) {
_compute_xvector_objf<<<Gr,Bl>>>(scores, scores_dim, objf_terms, objf_dim,
objf_derivs, derivs_dim);
}

/*
* "double"
*/
@@ -3029,6 +3058,13 @@ void cudaD_equal_element_mask(dim3 Gr, dim3 Bl, const double *mat1,
_equal_element_mask<<<Gr,Bl>>>(mat1, mat2, mask, mat1_dim, mat2_stride, mask_stride);
}

void cudaD_compute_xvector_objf(dim3 Gr, dim3 Bl, const double *scores,
MatrixDim scores_dim, double *objf_terms,
MatrixDim objf_dim, double *objf_derivs,
MatrixDim derivs_dim) {
_compute_xvector_objf<<<Gr,Bl>>>(scores, scores_dim, objf_terms, objf_dim,
objf_derivs, derivs_dim);
}


/* Some conversion kernels for which it's more convenient to not name them F or D. */
15 changes: 15 additions & 0 deletions src/cudamatrix/cu-kernels.h
@@ -289,6 +289,13 @@ inline void cuda_equal_element_mask(dim3 Gr, dim3 Bl, const float *mat1, const f
cudaF_equal_element_mask(Gr, Bl, mat1, mat2, mask, mat1_dim, mat2_stride, mask_stride);
}

inline void cuda_compute_xvector_objf(dim3 Gr, dim3 Bl, const float *scores,
MatrixDim scores_dim, float *objf_terms,
MatrixDim objf_dim, float *objf_derivs,
MatrixDim derivs_dim) {
cudaF_compute_xvector_objf(Gr, Bl, scores, scores_dim, objf_terms, objf_dim,
objf_derivs, derivs_dim);
}


// double versions
@@ -467,6 +474,14 @@ inline void cuda_equal_element_mask(dim3 Gr, dim3 Bl, const double *mat1, const
cudaD_equal_element_mask(Gr, Bl, mat1, mat2, mask, mat1_dim, mat2_stride, mask_stride);
}

inline void cuda_compute_xvector_objf(dim3 Gr, dim3 Bl, const double *scores,
MatrixDim scores_dim, double *objf_terms,
MatrixDim objf_dim, double *objf_derivs,
MatrixDim derivs_dim) {
cudaD_compute_xvector_objf(Gr, Bl, scores, scores_dim, objf_terms, objf_dim,
objf_derivs, derivs_dim);
}

// Also include some template-friendly wrappers of cublas functions:
inline cublasStatus_t cuda_axpy(cublasHandle_t handle, int n, float alpha, const float *x, int incx, float *y, int incy) {
return cublasSaxpy_v2(handle, n, &alpha, x, incx, y, incy);
73 changes: 54 additions & 19 deletions src/cudamatrix/cu-math.cc
@@ -2,6 +2,7 @@

// Copyright 2009-2012 Karel Vesely
// Johns Hopkins University (author: Daniel Povey)
// 2016 David Snyder

// See ../../COPYING for clarification regarding multiple authors
//
@@ -29,15 +30,15 @@ namespace kaldi {
namespace cu {

/*
* templated functions wrapping the ANSI-C CUDA kernel functions
*/


template<typename Real>
void RegularizeL1(CuMatrixBase<Real> *weight, CuMatrixBase<Real> *grad, Real l1, Real lr) {
KALDI_ASSERT(SameDim(*weight, *grad));
#if HAVE_CUDA == 1
if (CuDevice::Instantiate().Enabled()) {
Timer tim;

dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
@@ -46,7 +47,7 @@ void RegularizeL1(CuMatrixBase<Real> *weight, CuMatrixBase<Real> *grad, Real l1,
cuda_regularize_l1(dimGrid, dimBlock, weight->Data(), grad->Data(), l1, lr,
weight->Dim(), grad->Stride());
CU_SAFE_CALL(cudaGetLastError());

CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
} else
#endif
@@ -55,11 +56,11 @@ void RegularizeL1(CuMatrixBase<Real> *weight, CuMatrixBase<Real> *grad, Real l1,
MatrixBase<Real> &grad2 = grad->Mat();
for(MatrixIndexT r=0; r<weight2.NumRows(); r++) {
for(MatrixIndexT c=0; c<weight2.NumCols(); c++) {

if(weight2(r,c)==0.0) continue; // skip L1 if zero weight!

Real l1_signed = l1;
if (weight2(r, c) < 0.0)
l1_signed = -l1;

Real before = weight2(r, c);
@@ -88,16 +89,16 @@ void Randomize(const CuMatrixBase<Real> &src,
#if HAVE_CUDA == 1
if (CuDevice::Instantiate().Enabled()) {
Timer tim;

/*
Note: default 16x16 block-size limits the --cachesize to matrix size 16*65535 x 16*65535
dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
dim3 dimGrid(n_blocks(tgt->NumCols(), CU2DBLOCK), n_blocks(copy_from_idx.Dim(), CU2DBLOCK));
*/

/*
* Let's use blocksize 4 x 128 (512 threads/block)
* and extend the randomizable matrices to: col 4*65535, row 128*65535
* (ie. max-cols:262140 (dim), max-rows:8388480 (datapoints))
*/
dim3 dimBlock(4, 128);
@@ -111,7 +112,7 @@ void Randomize(const CuMatrixBase<Real> &src,
cuda_randomize(dimGrid, dimBlock, tgt->Data(), src.Data(),
copy_from_idx.Data(), dimtgt, dimsrc);
CU_SAFE_CALL(cudaGetLastError());

CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
} else
#endif
@@ -124,28 +125,28 @@ void Randomize(const CuMatrixBase<Real> &src,
tgtmat.Row(i).CopyFromVec(srcmat.Row(copy_from_idxvec[i]));
}
}
}



template<typename Real>
void Splice(const CuMatrixBase<Real> &src, const CuArray<int32> &frame_offsets,
CuMatrixBase<Real> *tgt) {

KALDI_ASSERT(src.NumCols()*frame_offsets.Dim() == tgt->NumCols());
KALDI_ASSERT(src.NumRows() == tgt->NumRows());

#if HAVE_CUDA == 1
if (CuDevice::Instantiate().Enabled()) {
Timer tim;

dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
dim3 dimGrid(n_blocks(tgt->NumCols(), CU2DBLOCK), n_blocks(tgt->NumRows(), CU2DBLOCK));

cuda_splice(dimGrid, dimBlock, tgt->Data(), src.Data(),
frame_offsets.Data(), tgt->Dim(), src.Dim());
CU_SAFE_CALL(cudaGetLastError());

CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
} else
#endif
@@ -171,22 +172,22 @@ void Splice(const CuMatrixBase<Real> &src, const CuArray<int32> &frame_offsets,

template<typename Real>
void Copy(const CuMatrixBase<Real> &src, const CuArray<int32> &copy_from_indices,
CuMatrixBase<Real> *tgt) {

KALDI_ASSERT(copy_from_indices.Dim() == tgt->NumCols());
KALDI_ASSERT(src.NumRows() == tgt->NumRows());

#if HAVE_CUDA == 1
if (CuDevice::Instantiate().Enabled()) {
Timer tim;

dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
dim3 dimGrid(n_blocks(tgt->NumCols(), CU2DBLOCK), n_blocks(tgt->NumRows(), CU2DBLOCK));

cuda_copy(dimGrid, dimBlock, tgt->Data(), src.Data(),
copy_from_indices.Data(), tgt->Dim(), src.Dim());
CU_SAFE_CALL(cudaGetLastError());

CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
} else
#endif
@@ -205,6 +206,31 @@ void Copy(const CuMatrixBase<Real> &src, const CuArray<int32> &copy_from_indices
}
}

template<typename Real>
void ComputeXvectorObjfFromScores(const CuMatrixBase<Real> &scores,
CuMatrixBase<Real> *objf_terms,
CuMatrixBase<Real> *objf_derivs) {
#if HAVE_CUDA == 1
Owner:
check the dimensions at the beginning of this function-- KALDI_ASSERT(SameDim(scores, *objf_terms) && ..)

if (CuDevice::Instantiate().Enabled()) {
Timer tim;
dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
dim3 dimGrid(n_blocks(scores.NumCols(), CU2DBLOCK),
n_blocks(scores.NumRows(), CU2DBLOCK));

cuda_compute_xvector_objf(dimGrid, dimBlock, scores.Data(), scores.Dim(),
objf_terms->Data(), objf_terms->Dim(), objf_derivs->Data(),
objf_derivs->Dim());
CU_SAFE_CALL(cudaGetLastError());

CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
} else
#endif
{
// TODO: Add the CPU version.
KALDI_LOG << "NOT USING CUDA";
}
}

// instantiate the templates.
template
void RegularizeL1(CuMatrixBase<float> *weight, CuMatrixBase<float> *grad, float l1, float lr);
@@ -233,6 +259,15 @@ void Randomize(const CuMatrixBase<double> &src,
const CuArray<int32> &copy_from_idx,
CuMatrixBase<double> *tgt);

template
Owner:
Since this is kind of a special purpose function, you don't have to instantiate for both float and double-- you can just hard-code it to BaseFloat. [i.e. not a template]

void ComputeXvectorObjfFromScores(const CuMatrixBase<float> &scores,
CuMatrixBase<float> *objf_terms,
CuMatrixBase<float> *objf_derivs);
template
void ComputeXvectorObjfFromScores(const CuMatrixBase<double> &scores,
CuMatrixBase<double> *objf_terms,
CuMatrixBase<double> *objf_derivs);



} //namespace cu
Expand Down
11 changes: 9 additions & 2 deletions src/cudamatrix/cu-math.h
@@ -1,7 +1,8 @@
// cudamatrix/cu-math.h

// Copyright 2009-2012 Karel Vesely
// 2013 Johns Hopkins University (Author: David Snyder)
// 2013 Johns Hopkins University (Author: Daniel Povey)
// 2016 David Snyder

// See ../../COPYING for clarification regarding multiple authors
//
@@ -78,7 +79,13 @@ void Group2norm(const CuMatrixBase<Real> &src,
CuMatrixBase<Real> *dest,
int32 group_stride);


/*
Computes, for a matrix of pairwise scores between xvectors, the matrix of
objective-function terms and the matrix of derivatives of the objective
w.r.t. the scores (see _compute_xvector_objf in cu-kernels.cu). The output
matrices should have the same dimensions as the scores.
*/
template <typename BaseFloat>
void ComputeXvectorObjfFromScores(const CuMatrixBase<BaseFloat> &scores,
CuMatrixBase<BaseFloat> *objf_terms,
CuMatrixBase<BaseFloat> *objf_derivs);


} // namespace cu
Expand Down
12 changes: 8 additions & 4 deletions src/ivector/Makefile
@@ -5,14 +5,18 @@ OPENFST_CXXFLAGS =
OPENFST_LDLIBS =
include ../kaldi.mk

TESTFILES = ivector-extractor-test plda-test logistic-regression-test
LDFLAGS += $(CUDA_LDFLAGS)
LDLIBS += $(CUDA_LDLIBS)

OBJFILES = ivector-extractor.o voice-activity-detection.o plda.o logistic-regression.o
TESTFILES = ivector-extractor-test plda-test logistic-regression-test xvector-test

OBJFILES = ivector-extractor.o voice-activity-detection.o plda.o logistic-regression.o xvector.o

LIBNAME = kaldi-ivector

ADDLIBS = ../gmm/kaldi-gmm.a ../tree/kaldi-tree.a ../transform/kaldi-transform.a \
../thread/kaldi-thread.a ../matrix/kaldi-matrix.a ../base/kaldi-base.a \
../util/kaldi-util.a
../thread/kaldi-thread.a ../cudamatrix/kaldi-cudamatrix.a \
../matrix/kaldi-matrix.a ../base/kaldi-base.a \
../util/kaldi-util.a

include ../makefiles/default_rules.mk