Merge pull request apache#67 from tqchen/master
Add vector dot
tqchen committed Oct 26, 2015
2 parents 28ffc0a + 7b4d869 commit 3b2a872
Showing 5 changed files with 67 additions and 12 deletions.
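For context, the VectorDot helper introduced by this commit computes the inner product of two 1-D tensors by delegating to the configured BLAS backend (CBLAS/MKL on CPU, cuBLAS on GPU). The following sketch is illustrative only and not part of the diff; it assumes a CPU build with CBLAS or MKL enabled, and the vector length of 8 is arbitrary.

// Illustrative usage sketch (not part of this commit); mirrors guide/basic.cpp.
// Assumes an mshadow CPU build with CBLAS or MKL enabled.
#include <cstdio>
#include "mshadow/tensor.h"
using namespace mshadow;

int main(void) {
  InitTensorEngine<cpu>();
  // Two length-8 vectors filled with ones.
  Tensor<cpu, 1, float> lhs = NewTensor<cpu, float>(Shape1(8), 1.0f);
  Tensor<cpu, 1, float> rhs = NewTensor<cpu, float>(Shape1(8), 1.0f);
  // Both vectors are all ones, so dot(lhs, rhs) equals their length: 8.
  printf("vdot=%f\n", VectorDot(lhs, rhs));
  FreeSpace(&lhs);
  FreeSpace(&rhs);
  ShutdownTensorEngine<cpu>();
  return 0;
}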
1 change: 1 addition & 0 deletions guide/basic.cpp
@@ -41,6 +41,7 @@ int main(void) {
  lhs = 1.0;
  rhs = 1.0;
  ret = implicit_dot(lhs, rhs.T());
  printf("vdot=%f\n", VectorDot(lhs[0], rhs[0]));
  int cnt = 0;
  for (index_t i = 0; i < ret.size(0); ++i) {
    for (index_t j = 0; j < ret.size(1); ++j) {
1 change: 1 addition & 0 deletions mshadow/base.h
@@ -69,6 +69,7 @@
#ifndef MSHADOW_USE_MKL
#define MSHADOW_USE_MKL 1
#endif

/*!
 * \brief use CUDA support, must ensure that the cuda include path is correct,
 * or directly compile using nvcc
50 changes: 47 additions & 3 deletions mshadow/dot_engine-inl.h
@@ -22,7 +22,7 @@ struct DotEngine {
// handles the dot
template<typename Device>
struct BLASEngine;
-#if (MSHADOW_USE_CBLAS || MSHADOW_USE_MKL)
+#if (MSHADOW_USE_MKL || MSHADOW_USE_CBLAS)
template<>
struct BLASEngine<cpu> {
  inline static CBLAS_TRANSPOSE GetT(bool t) {
@@ -74,6 +74,18 @@ struct BLASEngine<cpu> {
                         const double *Y, int incY, double *A, int lda) {
    cblas_dger(CblasColMajor, m, n, alpha, X, incX, Y, incY, A, lda);
  }
  inline static float dot(Stream<cpu> *stream,
                          int n,
                          const float* X, int incX,
                          const float* Y, int incY) {
    return cblas_sdot(n, X, incX, Y, incY);
  }
  inline static double dot(Stream<cpu> *stream,
                           int n,
                           const double* X, int incX,
                           const double* Y, int incY) {
    return cblas_ddot(n, X, incX, Y, incY);
  }
};
#elif MSHADOW_STAND_ALONE == 1
template<>
@@ -123,6 +135,18 @@ struct BLASEngine<cpu> {
                         const double *Y, int incY, double *A, int lda) {
    LOG(FATAL) << "Not implemented!";
  }
  inline static float dot(Stream<cpu> *stream,
                          int n,
                          const float* X, int incX,
                          const float* Y, int incY) {
    LOG(FATAL) << "Not implemented!";
    return 0.0f;
  }
  inline static double dot(Stream<cpu> *stream,
                           int n,
                           const double* X, int incX,
                           const double* Y, int incY) {
    LOG(FATAL) << "Not implemented!";
    return 0.0;
  }
};
#endif // MSHADOW_USE_CBLAS || MSHADOW_USE_MKL || MSHADOW_STAND_ALONE
// CuBLAS redirect code
@@ -183,17 +207,37 @@ struct BLASEngine<gpu> {
                         const float *X, int incX,
                         const float *Y, int incY, float *A, int lda) {
    cublasStatus_t err = cublasSger(Stream<gpu>::GetBlasHandle(stream),
                                    m, n, &alpha, X, incX, Y, incY, A, lda);
    CHECK_EQ(err, CUBLAS_STATUS_SUCCESS) << "Cublas: Sger fail";
  }
  inline static void ger(Stream<gpu> *stream,
                         int m, int n, double alpha,
                         const double *X, int incX,
                         const double *Y, int incY, double *A, int lda) {
    cublasStatus_t err = cublasDger(Stream<gpu>::GetBlasHandle(stream),
                                    m, n, &alpha, X, incX, Y, incY, A, lda);
    CHECK_EQ(err, CUBLAS_STATUS_SUCCESS) << "Cublas: Dger fail";
  }
  inline static float dot(Stream<gpu> *stream,
                          int n,
                          const float* X, int incX,
                          const float* Y, int incY) {
    float ret;
    cublasStatus_t err = cublasSdot(Stream<gpu>::GetBlasHandle(stream),
                                    n, X, incX, Y, incY, &ret);
    CHECK_EQ(err, CUBLAS_STATUS_SUCCESS) << "Cublas: Dot fail";
    return ret;
  }
  inline static double dot(Stream<gpu> *stream,
                           int n,
                           const double* X, int incX,
                           const double* Y, int incY) {
    double ret;
    cublasStatus_t err = cublasDdot(Stream<gpu>::GetBlasHandle(stream),
                                    n, X, incX, Y, incY, &ret);
    CHECK_EQ(err, CUBLAS_STATUS_SUCCESS) << "Cublas: Dot fail";
    return ret;
  }
};
#endif // MSHADOW_USE_CUDA
// helper function to decide which shape we are in
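The dot wrappers added above all follow standard BLAS xDOT semantics. As a reference (illustrative sketch, not part of this commit), the quantity that cblas_sdot/cblas_ddot and cublasSdot/cublasDdot return for positive strides is:

// Reference semantics of BLAS xDOT for positive strides (illustrative only):
// dot(n, X, incX, Y, incY) = sum over i in [0, n) of X[i * incX] * Y[i * incY].
template<typename DType>
inline DType reference_dot(int n, const DType *X, int incX,
                           const DType *Y, int incY) {
  DType sum = DType(0);
  for (int i = 0; i < n; ++i) {
    sum += X[i * incX] * Y[i * incY];
  }
  return sum;
}

VectorDot (defined in tensor_cpu-inl.h below) calls these wrappers with incX = incY = 1 and n = lhs.size(0), so the result is a plain elementwise multiply-accumulate over the two vectors.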
10 changes: 10 additions & 0 deletions mshadow/tensor.h
@@ -761,6 +761,16 @@ template<typename Saver, typename Reducer, int dimkeep,
inline void MapReduceKeepHighDim(TRValue<R, gpu, 1, DType> *dst,
                                 const expr::Exp<E, DType, etype> &exp,
                                 DType scale = 1);

/*!
 * \brief CPU/GPU: 1-dimensional vector dot product
 * \param lhs Left operand vector
 * \param rhs Right operand vector
 * \return dot(lhs, rhs)
 */
template<typename Device, typename DType>
inline DType VectorDot(const Tensor<Device, 1, DType> &lhs,
                       const Tensor<Device, 1, DType> &rhs);
} // namespace mshadow
// include headers
#include "./stream_gpu-inl.h"
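Because the declaration above is templated on Device, code built on VectorDot can stay device-generic. A hypothetical helper (illustrative only; CosineSimilarity is not part of mshadow or this commit) shows the pattern; it assumes the caller has already allocated the tensors and, for gpu, attached their stream.

// Hypothetical device-generic helper built on the new VectorDot API.
#include <cmath>
#include "mshadow/tensor.h"

template<typename Device, typename DType>
inline DType CosineSimilarity(const mshadow::Tensor<Device, 1, DType> &a,
                              const mshadow::Tensor<Device, 1, DType> &b) {
  DType ab = mshadow::VectorDot(a, b);  // <a, b>
  DType aa = mshadow::VectorDot(a, a);  // ||a||^2
  DType bb = mshadow::VectorDot(b, b);  // ||b||^2
  return ab / (std::sqrt(aa) * std::sqrt(bb));
}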
17 changes: 8 additions & 9 deletions mshadow/tensor_cpu-inl.h
@@ -10,6 +10,7 @@
#include "./base.h"
#include "./tensor.h"
#include "./packet-inl.h"
#include "./dot_engine-inl.h"

namespace mshadow {
template<>
@@ -331,15 +332,13 @@ inline void Softmax(Tensor<cpu, 3, DType> dst,
}
}

-template<typename DType>
-inline DType VDot(const Tensor<cpu, 1, DType> &lhs,
-                  const Tensor<cpu, 1, DType> &rhs) {
-  CHECK_EQ(lhs.shape_, rhs.shape_) << "VDot: shape mismatch";
-  DType sum = static_cast<DType>(0);
-  for (index_t x = 0; x < lhs.size(0); ++x) {
-    sum += lhs[x] * rhs[x];
-  }
-  return sum;
+// blas related
+template<typename Device, typename DType>
+inline DType VectorDot(const Tensor<Device, 1, DType> &lhs,
+                       const Tensor<Device, 1, DType> &rhs) {
+  expr::BLASEngine<Device>::SetStream(lhs.stream_);
+  return mshadow::expr::BLASEngine<Device>::dot(
+    lhs.stream_, lhs.size(0), lhs.dptr_, 1, rhs.dptr_, 1);
}
} // namespace mshadow
#endif // MSHADOW_TENSOR_CPU_INL_H_
