From c69a25c7ec5694a7002a3e44bc2e6821156246d4 Mon Sep 17 00:00:00 2001
From: shufan wu
Date: Sat, 9 Mar 2019 16:16:55 +0800
Subject: [PATCH 1/5] Fix review comments

---
 src/operator/tensor/elemwise_unary_op.h          | 42 +++++++++----------
 .../tensor/elemwise_unary_op_basic.cc            | 35 ++++-------------
 2 files changed, 30 insertions(+), 47 deletions(-)

diff --git a/src/operator/tensor/elemwise_unary_op.h b/src/operator/tensor/elemwise_unary_op.h
index d2d221bbd628..26938404018c 100644
--- a/src/operator/tensor/elemwise_unary_op.h
+++ b/src/operator/tensor/elemwise_unary_op.h
@@ -591,27 +591,27 @@ struct ReshapeLikeParam : public dmlc::Parameter<ReshapeLikeParam> {
  * *  With this macro means mxnet compile with MKL to accelerate math function with mkl.
  * *  Will Register FCompute with UnaryOp::MKL_Compute() to compelet the math function.
  */
-#define MXNET_MKL_OPERATOR_REGISTER_UNARY_WITH_RSP_CSR(__name$, __xpu$, __kernel$, __mkl_kernel$)  \
-  MXNET_MKL_OPERATOR_REGISTER_UNARY(__name$)                                                       \
-  MXNET_ADD_SPARSE_OP_ALIAS(__name$)                                                               \
-  .set_attr<FInferStorageType>("FInferStorageType", ElemwiseStorageType<1, 1, false, true, true>)  \
-  .set_attr<FCompute>("FCompute<" #__xpu$ ">", UnaryOp::MKL_Compute<__kernel$, __mkl_kernel$>)     \
-  .set_attr<FComputeEx>("FComputeEx<" #__xpu$ ">", UnaryOp::MKL_ComputeEx<__kernel$, __mkl_kernel$>)
-
-/*! \bried MKL Unary compute.
- * *  With this macro means mxnet compile with MKL to accelerate math function with mkl.
- * *  Will Register FCompute with UnaryOp::MKL_Compute() to compelet the math function.
-*/
-#define MXNET_MKL_OPERATOR_REGISTER_UNARY_WITH_RSP(__name$, __xpu$, __kernel$, __mkl_kernel$)      \
-  MXNET_MKL_OPERATOR_REGISTER_UNARY(__name$)                                                       \
-  MXNET_ADD_SPARSE_OP_ALIAS(__name$)                                                               \
-  .set_attr<FInferStorageType>("FInferStorageType", ElemwiseStorageType<1, 1, false, true, false>) \
-  .set_attr<FCompute>("FCompute<" #__xpu$ ">", UnaryOp::MKL_Compute<__kernel$, __mkl_kernel$>)     \
-  .set_attr<FComputeEx>("FComputeEx<" #__xpu$ ">", UnaryOp::MKL_ComputeEx<__kernel$, __mkl_kerbel$>)
-
-#define MXNET_MKL_OPERATOR_REGISTER_UNARY_WITH_SPARSE_DR(__name$, __xpu$, __kernel$, __mkl_kernel$)\
-  MXNET_MKL_OPERATOR_REGISTER_UNARY(__name$)                                                       \
-  .set_attr<FCompute>("FCompute<" #__xpu$ ">", UnaryOp::MKL_Compute<__kernel$, __mkl_kernel$>)
+  #define MXNET_MKL_OPERATOR_REGISTER_UNARY_WITH_RSP_CSR(__name$, __xpu$, __kernel$, __mkl_kernel$) \
+  MXNET_OPERATOR_REGISTER_UNARY(__name$)                                                            \
+  MXNET_ADD_SPARSE_OP_ALIAS(__name$)                                                                \
+  .set_attr<FInferStorageType>("FInferStorageType", ElemwiseStorageType<1, 1, false, true, true>)   \
+  .set_attr<FCompute>("FCompute<" #__xpu$ ">", UnaryOp::MKL_Compute<__kernel$, __mkl_kernel$>)      \
+  .set_attr<FComputeEx>("FComputeEx<" #__xpu$ ">", UnaryOp::MKL_ComputeEx<__kernel$, __mkl_kernel$>)
+
+  /*! \brief MKL unary compute.
+   * *  Use this macro when MXNet is compiled with MKL to accelerate the math function with MKL.
+   * *  Registers FCompute with UnaryOp::MKL_Compute() to perform the math function.
+   */
+  #define MXNET_MKL_OPERATOR_REGISTER_UNARY_WITH_RSP(__name$, __xpu$, __kernel$, __mkl_kernel$)     \
+  MXNET_OPERATOR_REGISTER_UNARY(__name$)                                                            \
+  MXNET_ADD_SPARSE_OP_ALIAS(__name$)                                                                \
+  .set_attr<FInferStorageType>("FInferStorageType", ElemwiseStorageType<1, 1, false, true, false>)  \
+  .set_attr<FCompute>("FCompute<" #__xpu$ ">", UnaryOp::MKL_Compute<__kernel$, __mkl_kernel$>)      \
+  .set_attr<FComputeEx>("FComputeEx<" #__xpu$ ">", UnaryOp::MKL_ComputeEx<__kernel$, __mkl_kernel$>)
+
+  #define MXNET_MKL_OPERATOR_REGISTER_UNARY_WITH_SPARSE_DR(__name$, __xpu$, __kernel$, __mkl_kernel$) \
+  MXNET_OPERATOR_REGISTER_UNARY(__name$)                                                              \
+  .set_attr<FCompute>("FCompute<" #__xpu$ ">", UnaryOp::MKL_Compute<__kernel$, __mkl_kernel$>)
 #endif

 /*! \brief Unary compute, with FComputeEx for csr and rsp available */
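Side note on the three macros above: they differ only in the storage types their FInferStorageType admits (dense plus rsp and csr, dense plus rsp, or dense-only), while the MKL variants swap the dense FCompute kernel for UnaryOp::MKL_Compute. A minimal sketch of what that distinction looks like from the Python API, assuming an MKL-enabled build; sqrt is used purely as an illustrative unary op with sparse support:

import mxnet as mx

# Dense input goes through FCompute: UnaryOp::MKL_Compute on an MKL build,
# the generic UnaryOp::Compute otherwise. The numerical result is the same;
# only the inner-loop implementation changes.
x = mx.nd.random.uniform(1, 2, shape=(4, 5))
y = mx.nd.sqrt(x)

# CSR input goes through FComputeEx, which applies the same kernel to the
# stored (non-zero) values and keeps the sparse storage type.
x_csr = x.tostype('csr')
y_csr = mx.nd.sqrt(x_csr)
print(y.stype, y_csr.stype)  # 'default', 'csr'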
diff --git a/src/operator/tensor/elemwise_unary_op_basic.cc b/src/operator/tensor/elemwise_unary_op_basic.cc
index dc876a02a2b4..8e37a488eebc 100644
--- a/src/operator/tensor/elemwise_unary_op_basic.cc
+++ b/src/operator/tensor/elemwise_unary_op_basic.cc
@@ -929,9 +929,9 @@ The storage type of ``cbrt`` output depends upon the input storage type:

 MXNET_OPERATOR_REGISTER_BINARY_WITH_SPARSE_CPU_DR(_backward_cbrt,
                                                   unary_bwd<mshadow_op::cube_root_grad>);
+
 // erf
-#if MSHADOW_USE_MKL == 1
-MXNET_MKL_OPERATOR_REGISTER_UNARY(erf)
+MXNET_OPERATOR_REGISTER_UNARY(erf)
 .describe(R"code(Returns element-wise gauss error function of the input.

 Example::
@@ -939,20 +939,13 @@ Example::

    erf([0, -1., 10.]) = [0., -0.8427, 1.]

 )code" ADD_FILELINE)
+#if MSHADOW_USE_MKL == 1
 .set_attr<FCompute>("FCompute<cpu>", UnaryOp::MKL_Compute<mshadow_op::erf, mkl_func::erf>)
-.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseIn{"_backward_erf"});
 #else
-MXNET_OPERATOR_REGISTER_UNARY(erf)
-.describe(R"code(Returns element-wise gauss error function of the input.
-
-Example::
-
-   erf([0, -1., 10.]) = [0., -0.8427, 1.]
-
-)code" ADD_FILELINE)
 .set_attr<FCompute>("FCompute<cpu>", UnaryOp::Compute<cpu, mshadow_op::erf>)
-.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseIn{"_backward_erf"});
 #endif  // MSHADOW_USE_MKL == 1
+.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseIn{"_backward_erf"});
+
 MXNET_OPERATOR_REGISTER_BINARY(_backward_erf)
 .set_attr<FCompute>("FCompute<cpu>",
                     ElemwiseBinaryOp::Compute<cpu, unary_bwd<mshadow_op::erf_grad>>);
@@ -1030,18 +1023,6 @@ The storage type of ``exp`` output is always dense
 #endif

 // log
-#if MSHADOW_USE_MKL == 1
-MXNET_MKL_OPERATOR_REGISTER_UNARY(log)
-.describe(R"code(Returns element-wise Natural logarithmic value of the input.
-
-The natural logarithm is logarithm in base *e*, so that ``log(exp(x)) = x``
-
-The storage type of ``log`` output is always dense
-
-)code" ADD_FILELINE)
-.set_attr<FCompute>("FCompute<cpu>", UnaryOp::MKL_Compute<mshadow_op::log, mkl_func::log>)
-.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseIn{"_backward_log"});
-#else
 MXNET_OPERATOR_REGISTER_UNARY(log)
 MXNET_ADD_SPARSE_OP_ALIAS(log)
 .describe(R"code(Returns element-wise Natural logarithmic value of the input.
@@ -1051,10 +1032,12 @@ The natural logarithm is logarithm in base *e*, so that ``log(exp(x)) = x``

 The storage type of ``log`` output is always dense

 )code" ADD_FILELINE)
+#if MSHADOW_USE_MKL == 1
+.set_attr<FCompute>("FCompute<cpu>", UnaryOp::MKL_Compute<mshadow_op::log, mkl_func::log>)
+#else
 .set_attr<FCompute>("FCompute<cpu>", UnaryOp::Compute<cpu, mshadow_op::log>)
-.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseIn{"_backward_log"});
 #endif  // MSHADOW_USE_MKL == 1
-
+.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseIn{"_backward_log"});
 // log10
 MXNET_OPERATOR_REGISTER_UNARY_WITH_SPARSE_DR(log10, cpu, mshadow_op::log10)
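After this hunk, erf and log each have a single registration whose describe and FGradient are shared, and MSHADOW_USE_MKL only selects the FCompute kernel. A quick sanity check that both kernels produce the documented values, sketched against NumPy/SciPy references (the tolerances are an assumption, not taken from the test suite):

import numpy as np
import mxnet as mx
from scipy import special  # SciPy's erf serves as the reference here

x = mx.nd.array([0., -1., 10.])
# Matches the docstring example: erf([0, -1., 10.]) = [0., -0.8427, 1.]
np.testing.assert_allclose(mx.nd.erf(x).asnumpy(),
                           special.erf(x.asnumpy()), rtol=1e-5)
# log(exp(x)) = x, per the log docstring
np.testing.assert_allclose(mx.nd.log(mx.nd.exp(x)).asnumpy(),
                           x.asnumpy(), rtol=1e-5, atol=1e-6)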
From 2c5c20c694f440a8a9b9335a634a76d70402f03d Mon Sep 17 00:00:00 2001
From: shufan wu
Date: Sat, 9 Mar 2019 16:34:04 +0800
Subject: [PATCH 2/5] remove unnecessary code

---
 src/operator/tensor/elemwise_unary_op.h | 16 ----------------
 1 file changed, 16 deletions(-)

diff --git a/src/operator/tensor/elemwise_unary_op.h b/src/operator/tensor/elemwise_unary_op.h
index 26938404018c..82a9aa9fe7a1 100644
--- a/src/operator/tensor/elemwise_unary_op.h
+++ b/src/operator/tensor/elemwise_unary_op.h
@@ -570,22 +570,6 @@ struct ReshapeLikeParam : public dmlc::Parameter<ReshapeLikeParam> {
   })                                                                  \
   .add_argument("data", "NDArray-or-Symbol", "The input array.")

-/*! \bried MKL Unary compute.
- *  With this macro means mxnet compile with MKL to accelerate math function with mkl.
- *  Will Register FCompute with UnaryOp::MKL_Compute() to compelet the math function.
- */
-#define MXNET_MKL_OPERATOR_REGISTER_UNARY(__name$)                    \
-  NNVM_REGISTER_OP(__name$)                                           \
-  .set_num_inputs(1)                                                  \
-  .set_num_outputs(1)                                                 \
-  .set_attr<nnvm::FInferShape>("FInferShape", ElemwiseShape<1, 1>)    \
-  .set_attr<nnvm::FInferType>("FInferType", ElemwiseType<1, 1>)       \
-  .set_attr<nnvm::FInplaceOption>("FInplaceOption",                   \
-    [](const NodeAttrs& attrs){                                       \
-      return std::vector<std::pair<int, int> >{{0, 0}};               \
-    })                                                                \
-  .add_argument("data", "NDArray-or-Symbol", "The input array.")
-
 #if MSHADOW_USE_MKL == 1
 /*! \bried MKL Unary compute.
  * *  With this macro means mxnet compile with MKL to accelerate math function with mkl.
  * *  Will Register FCompute with UnaryOp::MKL_Compute() to compelet the math function.

From b1b635558f68a1fec7f1bc287e7d2a1e9f5e7265 Mon Sep 17 00:00:00 2001
From: shufan wu
Date: Sun, 10 Mar 2019 09:22:18 +0800
Subject: [PATCH 3/5] Update test case

---
 tests/python/gpu/test_operator_gpu.py | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py
index b037218dc07f..08902563c9b0 100644
--- a/tests/python/gpu/test_operator_gpu.py
+++ b/tests/python/gpu/test_operator_gpu.py
@@ -2200,7 +2200,7 @@ def test_context_num_gpus():
     assert mx.context.num_gpus() > 0

 def math_log(shape, dtype, check_value):
-    np_x = np.random.rand(shape[0], shape[1])
+    np_x = np.random.rand(*tuple(shape))
     x = mx.nd.array(np_x, dtype=dtype)
     mx.nd.waitall()
     y = mx.nd.log(data=x)
@@ -2213,7 +2213,7 @@ def math_log(shape, dtype, check_value):
         assert_almost_equal(y.asnumpy(), y_.asnumpy())

 def math_erf(shape, dtype, check_value):
-    np_x = np.random.rand(shape[0], shape[1])
+    np_x = np.random.rand(*tuple(shape))
     x = mx.nd.array(np_x, dtype=dtype)
     mx.nd.waitall()
     y = mx.nd.erf(data=x)
@@ -2226,7 +2226,7 @@ def math_erf(shape, dtype, check_value):
         assert_almost_equal(y.asnumpy(), y_.asnumpy())

 def math_square(shape, dtype, check_value):
-    np_x = np.random.rand(shape[0], shape[1])
+    np_x = np.random.rand(*tuple(shape))
     x = mx.nd.array(np_x, dtype=dtype)
     mx.nd.waitall()
     y = mx.nd.square(data=x)
@@ -2252,12 +2252,9 @@ def run_math(op, shape, dtype="float32", check_value=True):
 def test_math():
     ops = ['log', 'erf', 'square']
     check_value= True
-    lshape = 1000
-    rshapes = [1, 10, 100, 1000, 10000]
+    shape_lst = [[1000], [100,1000], [10,100,100], [10,100,100,100]]
     dtypes = ["float32", "float64"]
-    for rshape in rshapes:
-        shape = (lshape, rshape)
-        print("shape:(%d, %d), " % (lshape, rshape), end="")
+    for shape in shape_lst:
         for dtype in dtypes:
             for op in ops:
                 run_math(op, shape, dtype, check_value=check_value)
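The reworked test_math now sweeps rank-1 through rank-4 shapes instead of a fixed 2-D matrix, which is what the np.random.rand(*tuple(shape)) change enables. The helper pattern, restated standalone for clarity (the function name and the shapes in the loop below are illustrative only):

import numpy as np
import mxnet as mx
from mxnet.test_utils import assert_almost_equal

def check_unary_vs_cpu(op, shape, dtype='float32'):
    # Same pattern as math_log/math_erf/math_square: evaluate the op in the
    # file's default context (the GPU in test_operator_gpu.py) and compare
    # against the CPU implementation on a copy of the same input.
    x = mx.nd.array(np.random.rand(*shape), dtype=dtype)
    y = op(data=x)
    y_cpu = op(data=x.as_in_context(mx.cpu()))
    assert_almost_equal(y.asnumpy(), y_cpu.asnumpy())

for shape in [(1000,), (10, 100, 100)]:
    check_unary_vs_cpu(mx.nd.log, shape)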
From f96c34a326c70a846fb7a3bca58d712af59d1817 Mon Sep 17 00:00:00 2001
From: shufan wu
Date: Mon, 11 Mar 2019 11:56:17 +0800
Subject: [PATCH 4/5] minor fix

---
 tests/python/gpu/test_operator_gpu.py | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py
index 08902563c9b0..533d7b0ed655 100644
--- a/tests/python/gpu/test_operator_gpu.py
+++ b/tests/python/gpu/test_operator_gpu.py
@@ -2202,40 +2202,28 @@ def test_context_num_gpus():
 def math_log(shape, dtype, check_value):
     np_x = np.random.rand(*tuple(shape))
     x = mx.nd.array(np_x, dtype=dtype)
-    mx.nd.waitall()
     y = mx.nd.log(data=x)
-    y.wait_to_read()
     if check_value:
         x_ = x.as_in_context(mx.cpu())
-        mx.nd.waitall()
         y_ = mx.nd.log(data=x_)
-        y_.wait_to_read()
         assert_almost_equal(y.asnumpy(), y_.asnumpy())

 def math_erf(shape, dtype, check_value):
     np_x = np.random.rand(*tuple(shape))
     x = mx.nd.array(np_x, dtype=dtype)
-    mx.nd.waitall()
     y = mx.nd.erf(data=x)
-    y.wait_to_read()
     if check_value:
         x_ = x.as_in_context(mx.cpu())
-        mx.nd.waitall()
         y_ = mx.nd.erf(data=x_)
-        y_.wait_to_read()
         assert_almost_equal(y.asnumpy(), y_.asnumpy())

 def math_square(shape, dtype, check_value):
     np_x = np.random.rand(*tuple(shape))
     x = mx.nd.array(np_x, dtype=dtype)
-    mx.nd.waitall()
     y = mx.nd.square(data=x)
-    y.wait_to_read()
     if check_value:
         x_ = x.as_in_context(mx.cpu())
-        mx.nd.waitall()
         y_ = mx.nd.square(data=x_)
-        y_.wait_to_read()
         assert_almost_equal(y.asnumpy(), y_.asnumpy())

 def run_math(op, shape, dtype="float32", check_value=True):

From 06c51e974cbaa70939e6ac82c98e5ee1b9a6bdf3 Mon Sep 17 00:00:00 2001
From: shufan wu
Date: Thu, 18 Apr 2019 22:02:25 +0800
Subject: [PATCH 5/5] move the position of MKL_Compute

---
 src/operator/tensor/elemwise_unary_op.h | 43 ++++++++++++-------------
 1 file changed, 21 insertions(+), 22 deletions(-)

diff --git a/src/operator/tensor/elemwise_unary_op.h b/src/operator/tensor/elemwise_unary_op.h
index 82a9aa9fe7a1..279efcf97084 100644
--- a/src/operator/tensor/elemwise_unary_op.h
+++ b/src/operator/tensor/elemwise_unary_op.h
@@ -266,6 +266,27 @@ class UnaryOp : public OpBase {
   }

 #if MSHADOW_USE_MKL == 1
+  template<typename OP, typename MKL_OP>
+  static void MKL_Compute(const nnvm::NodeAttrs& attrs,
+                          const OpContext& ctx,
+                          const std::vector<TBlob>& inputs,
+                          const std::vector<OpReqType>& req,
+                          const std::vector<TBlob>& outputs) {
+    if (req[0] == kNullOp) return;
+    auto type_flag = inputs[0].type_flag_;
+    size_t input_size = inputs[0].Size();
+    if ((req[0] == kWriteTo || req[0] == kWriteInplace) &&
+        mkl_func::check_size(input_size) &&
+        mkl_func::check_type(type_flag)) {
+      // set DType as float or double according to type_flag
+      MSHADOW_SGL_DBL_TYPE_SWITCH(type_flag, DType, {
+        MKL_OP::Vectorize(input_size, inputs[0].dptr<DType>(), outputs[0].dptr<DType>());
+      });
+    } else {
+      Compute<cpu, OP>(attrs, ctx, inputs, req, outputs);
+    }
+  }
+
   template<typename OP, typename MKL_OP>
   static void MKL_ComputeEx(const nnvm::NodeAttrs& attrs,
                             const OpContext& ctx,
@@ -375,28 +396,6 @@ class UnaryOp : public OpBase {
     }
   }

-#if MSHADOW_USE_MKL == 1
-  template<typename OP, typename MKL_OP>
-  static void MKL_Compute(const nnvm::NodeAttrs& attrs,
-                          const OpContext& ctx,
-                          const std::vector<TBlob>& inputs,
-                          const std::vector<OpReqType>& req,
-                          const std::vector<TBlob>& outputs) {
-    if (req[0] == kNullOp) return;
-    auto type_flag = inputs[0].type_flag_;
-    size_t input_size = inputs[0].Size();
-    if ((req[0] == kWriteTo || req[0] == kWriteInplace) &&
-        mkl_func::check_size(input_size) &&
-        mkl_func::check_type(type_flag)) {
-      // set DType as float or double according to type_flag
-      MSHADOW_SGL_DBL_TYPE_SWITCH(type_flag, DType, {
-        MKL_OP::Vectorize(input_size, inputs[0].dptr<DType>(), outputs[0].dptr<DType>());
-      });
-    } else {
-      Compute<cpu, OP>(attrs, ctx, inputs, req, outputs);
-    }
-  }
-#endif  // MSHADOW_USE_MKL == 1
 };

 /*! \brief Map legacy unary_bwd to backward_grad */
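To summarize the dispatch rule that the relocated MKL_Compute implements: the vectorized MKL kernel is taken only for plain dense writes (kWriteTo or kWriteInplace) on tensors that pass mkl_func::check_size and mkl_func::check_type; everything else falls back to the generic UnaryOp::Compute. A pseudo-Python restatement of that branch logic, for reading alongside the C++ above (the size bound and type set here are assumptions; the real checks live in mkl_func):

def mkl_compute_dispatch(req, type_flag, input_size):
    # Mirrors the C++ MKL_Compute above, returning which kernel would run.
    if req == 'null':
        return 'no-op'                                 # req[0] == kNullOp
    full_write = req in ('write_to', 'write_inplace')  # kWriteTo / kWriteInplace
    ok_type = type_flag in ('float32', 'float64')      # mkl_func::check_type
    ok_size = 0 < input_size <= 2**31 - 1              # mkl_func::check_size (assumed bound)
    if full_write and ok_type and ok_size:
        return 'MKL_OP::Vectorize'                     # vectorized MKL math (vsLn/vdLn, vsErf/vdErf, ...)
    return 'UnaryOp::Compute'                          # generic element-wise fallback

print(mkl_compute_dispatch('write_to', 'float32', 1024))  # MKL_OP::Vectorize
print(mkl_compute_dispatch('add_to', 'float32', 1024))    # UnaryOp::Compute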