From 672be6a715b59a6c01f745a039d758c8ecf325fe Mon Sep 17 00:00:00 2001
From: Tao Lv
Date: Thu, 18 Apr 2019 14:39:26 +0800
Subject: [PATCH] add functions

---
 src/operator/mkl_functions-inl.h        | 151 +++++++++++++++++++-----
 src/operator/tensor/elemwise_unary_op.h |  48 ++++----
 2 files changed, 146 insertions(+), 53 deletions(-)

diff --git a/src/operator/mkl_functions-inl.h b/src/operator/mkl_functions-inl.h
index f3615f4ad17e..b224d08ab126 100644
--- a/src/operator/mkl_functions-inl.h
+++ b/src/operator/mkl_functions-inl.h
@@ -44,36 +44,31 @@ static bool check_type(const int t) {
   return (t == mshadow::kFloat32 || t == mshadow::kFloat64);
 }
 
-#define MXNET_MKL_UNARY_MATH_FUNC(name, func)                                           \
-  struct name : public mxnet_op::tunable {                                              \
-    template <typename DType>                                                           \
-    MSHADOW_XINLINE static void Map(const index_t n, const DType *src, float *dst) {    \
-      vs##func(static_cast<MKL_INT>(n), reinterpret_cast<const float *>(src), dst);     \
-    }                                                                                   \
-    MSHADOW_XINLINE static void Map(const index_t n, const double *src, double *dst) {  \
-      vd##func(static_cast<MKL_INT>(n), src, dst);                                      \
-    }                                                                                   \
-  }
+#define MXNET_MKL_UNARY_MATH_FUNC(name, func)                                              \
+struct name {                                                                              \
+  MSHADOW_XINLINE static void Vectorize(const index_t n, const float *src, float *dst) {   \
+    vs##func(static_cast<MKL_INT>(n), src, dst);                                           \
+  }                                                                                        \
+  MSHADOW_XINLINE static void Vectorize(const index_t n, const double *src, double *dst) { \
+    vd##func(static_cast<MKL_INT>(n), src, dst);                                           \
+  }                                                                                        \
+};
 
-#define MXNET_MKL_BINARY_MATH_FUNC(name, func)        \
-  struct name : public mxnet_op::tunable {            \
-    template <typename DType>                         \
-    MSHADOW_XINLINE static void Map(const index_t n,  \
-                                    const DType *a,   \
-                                    const DType *b,   \
-                                    float *c) {       \
-      vs##func(static_cast<MKL_INT>(n),               \
-               reinterpret_cast<const float *>(a),    \
-               reinterpret_cast<const float *>(b),    \
-               c);                                    \
-    }                                                 \
-    MSHADOW_XINLINE static void Map(const index_t n,  \
-                                    const double *a,  \
-                                    const double *b,  \
-                                    double *c) {      \
-      vd##func(static_cast<MKL_INT>(n), a, b, c);     \
-    }                                                 \
-  }
+#define MXNET_MKL_BINARY_MATH_FUNC(name, func)           \
+struct name {                                            \
+  MSHADOW_XINLINE static void Vectorize(const index_t n, \
+                                        const float *a,  \
+                                        const float *b,  \
+                                        float *c) {      \
+    vs##func(static_cast<MKL_INT>(n), a, b, c);          \
+  }                                                      \
+  MSHADOW_XINLINE static void Vectorize(const index_t n, \
+                                        const double *a, \
+                                        const double *b, \
+                                        double *c) {     \
+    vd##func(static_cast<MKL_INT>(n), a, b, c);          \
+  }                                                      \
+};
 
 MXNET_MKL_UNARY_MATH_FUNC(erf, Erf);
 MXNET_MKL_UNARY_MATH_FUNC(exp, Exp);
@@ -118,6 +113,104 @@ MXNET_MKL_BINARY_MATH_FUNC(pow, Pow);
 
 MXNET_MKL_BINARY_MATH_FUNC(hypot, Hypot);
 
+// Scalar helpers used by the fused row-wise kernels below.
+// NOTE(review): inputs are taken as `const DType *` so the kernels can pass
+// pointers derived from their const input tensors.
+template <typename DType>
+MSHADOW_XINLINE static void sub_(index_t n, const DType *in, DType b, DType *dst) {
+  for (index_t i = 0; i < n; i++)
+    dst[i] = in[i] - b;
+}
+
+template <typename DType>
+MSHADOW_XINLINE static void div_(index_t n, const DType *in, DType b, DType *dst) {
+  for (index_t i = 0; i < n; i++)
+    dst[i] = in[i] / b;
+}
+
+template <typename DType>
+MSHADOW_XINLINE static void sum_(index_t n, const DType *in, DType *dst) {
+  // dst[0] = cblas_sasum(n, in, 1);
+  DType sum = 0.0f;
+  for (index_t i = 0; i < n; i++)
+    sum += in[i];
+
+  dst[0] = sum;
+}
+
+// `index_t n` (not `int`): every caller passes an index_t length.
+template <typename DType>
+MSHADOW_XINLINE static void max_(index_t n, const DType * __restrict__ in, DType *dst) {
+  dst[0] = in[0];
+  for (index_t i = 1; i < n; i++)
+    dst[0] = (dst[0] < in[i]) ? in[i] : dst[0];
+}
+
+// LayerNorm on the last dimension.
+// a: input (m x n); b: normalized output; ws: scratch (m x n);
+// gamma/beta: per-column scale/shift (length n); mean/var: per-row stats (length m).
+// Outputs (b, ws, mean, var) must be non-const: they are written through.
+template <typename DType>
+MSHADOW_XINLINE static void LayerNormLastDim(const index_t m,
+                                             const index_t n,
+                                             const DType *a,
+                                             DType *b,
+                                             DType *ws,
+                                             const DType *gamma,
+                                             const DType *beta,
+                                             DType *mean,
+                                             DType *var,
+                                             const DType eps) {
+#pragma omp parallel for
+  for (index_t i = 0; i < m; i++) {
+    const DType *in_offset = a + i * n;
+    DType *out_offset = b + i * n;
+    DType *ws_offset = ws + i * n;
+
+    sum_(n, in_offset, &(mean[i]));
+    mean[i] /= n;
+    sub_(n, in_offset, mean[i], out_offset);
+    // square/mul/add are the MKL wrapper structs above: call their Vectorize().
+    square::Vectorize(n, out_offset, ws_offset);
+    sum_(n, ws_offset, &(var[i]));
+    var[i] = sqrt(var[i] / n + eps);
+
+    mul::Vectorize(n, out_offset, gamma, out_offset);
+    div_(n, out_offset, var[i], out_offset);
+    add::Vectorize(n, out_offset, beta, out_offset);
+  }
+}
+
+// softmax on the last dimension (b = output, written through).
+template <typename DType>
+MSHADOW_XINLINE static void SoftmaxLastDim(const index_t m,
+                                           const index_t n,
+                                           const DType *a,
+                                           DType *b) {
+#pragma omp parallel for
+  for (index_t i = 0; i < m; i++) {
+    const DType *in_offset = a + i * n;
+    DType *out_offset = b + i * n;
+
+    exp::Vectorize(n, in_offset, out_offset);
+    DType sum = 0.0f;  // DType, not float: sum_() needs DType* for double too
+    sum_(n, out_offset, &sum);
+    div_(n, out_offset, sum, out_offset);
+  }
+}
+
+// log-softmax on the last dimension, max-shifted for numerical stability.
+template <typename DType>
+MSHADOW_XINLINE static void LogSoftmaxLastDim(const index_t m,
+                                              const index_t n,
+                                              const DType *a,
+                                              DType *b) {
+#pragma omp parallel for
+  for (index_t i = 0; i < m; i++) {
+    const DType *in_offset = a + i * n;
+    DType *out_offset = b + i * n;
+
+    // b_max (renamed: must not shadow the output parameter `b`)
+    DType b_max, logsum;
+    max_(n, in_offset, &b_max);
+    sub_(n, in_offset, b_max, out_offset);
+    exp::Vectorize(n, out_offset, out_offset);
+    sum_(n, out_offset, &logsum);
+    logsum = b_max + log(logsum);  // log, not logf: keep double precision
+    sub_(n, in_offset, logsum, out_offset);
+  }
+}
+
 }  // namespace mkl_func
 }  // namespace op
 }  // namespace mxnet
diff --git a/src/operator/tensor/elemwise_unary_op.h b/src/operator/tensor/elemwise_unary_op.h
index 600803c953f6..d2d221bbd628 100644
--- a/src/operator/tensor/elemwise_unary_op.h
+++ b/src/operator/tensor/elemwise_unary_op.h
@@ -390,7 +390,7 @@ class UnaryOp : public OpBase {
         mkl_func::check_type(type_flag)) {
       // set DType as float or double according to type_flag
       MSHADOW_SGL_DBL_TYPE_SWITCH(type_flag, DType, {
-        MKL_OP::Map(input_size, inputs[0].dptr<DType>(), outputs[0].dptr<DType>());
+        MKL_OP::Vectorize(input_size, inputs[0].dptr<DType>(), outputs[0].dptr<DType>());
       });
     } else {
       Compute<xpu, OP>(attrs, ctx, inputs, req, outputs);
@@ -562,7 +562,7 @@ struct ReshapeLikeParam : public dmlc::Parameter<ReshapeLikeParam> {
   NNVM_REGISTER_OP(__name$)                                        \
   .set_num_inputs(1)                                               \
   .set_num_outputs(1)                                              \
-  .set_attr<nnvm::FInferShape>("FInferShape", ElemwiseShape<1, 1>) \
+  .set_attr<mxnet::FInferShape>("FInferShape", ElemwiseShape<1, 1>)\
   .set_attr<nnvm::FInferType>("FInferType", ElemwiseType<1, 1>)    \
   .set_attr<nnvm::FInplaceOption>("FInplaceOption",                \
     [](const NodeAttrs& attrs){                                    \
@@ -578,7 +578,7 @@ struct ReshapeLikeParam : public dmlc::Parameter<ReshapeLikeParam> {
   NNVM_REGISTER_OP(__name$)                                        \
   .set_num_inputs(1)                                               \
   .set_num_outputs(1)                                              \
-  .set_attr<nnvm::FInferShape>("FInferShape", ElemwiseShape<1, 1>) \
+  .set_attr<mxnet::FInferShape>("FInferShape", ElemwiseShape<1, 1>)\
   .set_attr<nnvm::FInferType>("FInferType", ElemwiseType<1, 1>)    \
   .set_attr<nnvm::FInplaceOption>("FInplaceOption",                \
     [](const NodeAttrs& attrs){                                    \
@@ -591,27 +591,27 @@ struct ReshapeLikeParam : public dmlc::Parameter<ReshapeLikeParam> {
  * * With this macro means mxnet compile with MKL to accelerate math function with mkl.
  * * Will Register FCompute with UnaryOp::MKL_Compute() to compelet the math function.
  */
- #define MXNET_MKL_OPERATOR_REGISTER_UNARY_WITH_RSP_CSR(__name$, __xpu$, __kernel$, __mkl_kernel$) \
- MXNET_MKL_OPERATOR_REGISTER_UNARY(__name$)                                                        \
- MXNET_ADD_SPARSE_OP_ALIAS(__name$)                                                                \
- .set_attr<FInferStorageType>("FInferStorageType", ElemwiseStorageType<1, 1, false, true, true>)   \
- .set_attr<FCompute>("FCompute<" #__xpu$ ">", UnaryOp::MKL_Compute<__kernel$, __mkl_kernel$>)      \
- .set_attr<FComputeEx>("FComputeEx<" #__xpu$ ">", UnaryOp::MKL_ComputeEx<__kernel$, __mkl_kernel$>)
-
- /*! \bried MKL Unary compute.
- * * With this macro means mxnet compile with MKL to accelerate math function with mkl.
- * * Will Register FCompute with UnaryOp::MKL_Compute() to compelet the math function.
- */
- #define MXNET_MKL_OPERATOR_REGISTER_UNARY_WITH_RSP(__name$, __xpu$, __kernel$, __mkl_kernel$)     \
- MXNET_MKL_OPERATOR_REGISTER_UNARY(__name$)                                                        \
- MXNET_ADD_SPARSE_OP_ALIAS(__name$)                                                                \
- .set_attr<FInferStorageType>("FInferStorageType", ElemwiseStorageType<1, 1, false, true, false>)  \
- .set_attr<FCompute>("FCompute<" #__xpu$ ">", UnaryOp::MKL_Compute<__kernel$, __mkl_kernel$>)      \
- .set_attr<FComputeEx>("FComputeEx<" #__xpu$ ">", UnaryOp::MKL_ComputeEx<__kernel$, __mkl_kerbel$>)
-
- #define MXNET_MKL_OPERATOR_REGISTER_UNARY_WITH_SPARSE_DR(__name$, __xpu$, __kernel$, __mkl_kernel$) \
- MXNET_MKL_OPERATOR_REGISTER_UNARY(__name$)                                                          \
- .set_attr<FCompute>("FCompute<" #__xpu$ ">", UnaryOp::MKL_Compute<__kernel$, __mkl_kernel$>)
+#define MXNET_MKL_OPERATOR_REGISTER_UNARY_WITH_RSP_CSR(__name$, __xpu$, __kernel$, __mkl_kernel$) \
+  MXNET_MKL_OPERATOR_REGISTER_UNARY(__name$)                                                      \
+  MXNET_ADD_SPARSE_OP_ALIAS(__name$)                                                              \
+  .set_attr<FInferStorageType>("FInferStorageType", ElemwiseStorageType<1, 1, false, true, true>) \
+  .set_attr<FCompute>("FCompute<" #__xpu$ ">", UnaryOp::MKL_Compute<__kernel$, __mkl_kernel$>)    \
+  .set_attr<FComputeEx>("FComputeEx<" #__xpu$ ">", UnaryOp::MKL_ComputeEx<__kernel$, __mkl_kernel$>)
+
+/*! \brief MKL Unary compute.
+ *  With this macro means mxnet compile with MKL to accelerate math function with mkl.
+ *  Will Register FCompute with UnaryOp::MKL_Compute() to complete the math function.
+*/
+#define MXNET_MKL_OPERATOR_REGISTER_UNARY_WITH_RSP(__name$, __xpu$, __kernel$, __mkl_kernel$) \
+  MXNET_MKL_OPERATOR_REGISTER_UNARY(__name$)                                                  \
+  MXNET_ADD_SPARSE_OP_ALIAS(__name$)                                                          \
+  .set_attr<FInferStorageType>("FInferStorageType", ElemwiseStorageType<1, 1, false, true, false>)\
+  .set_attr<FCompute>("FCompute<" #__xpu$ ">", UnaryOp::MKL_Compute<__kernel$, __mkl_kernel$>) \
+  .set_attr<FComputeEx>("FComputeEx<" #__xpu$ ">", UnaryOp::MKL_ComputeEx<__kernel$, __mkl_kernel$>)
+
+#define MXNET_MKL_OPERATOR_REGISTER_UNARY_WITH_SPARSE_DR(__name$, __xpu$, __kernel$, __mkl_kernel$)\
+  MXNET_MKL_OPERATOR_REGISTER_UNARY(__name$)                                                       \
+  .set_attr<FCompute>("FCompute<" #__xpu$ ">", UnaryOp::MKL_Compute<__kernel$, __mkl_kernel$>)
 #endif
 
 /*! \brief Unary compute, with FComputeEx for csr and rsp available */