From 3026bae77b46e444b0931938fd56ba9cc592377c Mon Sep 17 00:00:00 2001 From: Hao Jin Date: Sat, 2 Nov 2019 22:15:55 -0700 Subject: [PATCH 1/4] support mixed-precision true_divide (#16711) --- src/common/utils.h | 36 +++ src/operator/leaky_relu-inl.h | 3 +- src/operator/mshadow_op.h | 40 +++ src/operator/mxnet_op.h | 113 +++++++ src/operator/nn/dropout-inl.h | 6 +- src/operator/nn/softmax-inl.h | 6 +- src/operator/numpy/np_true_divide-inl.h | 284 +++++++++++++++--- src/operator/numpy/np_true_divide.cc | 51 +++- src/operator/numpy/np_true_divide.cu | 2 +- .../tensor/elemwise_binary_broadcast_op.h | 57 +++- .../tensor/elemwise_binary_scalar_op.h | 2 +- tests/python/unittest/test_numpy_op.py | 134 ++++++++- 12 files changed, 660 insertions(+), 74 deletions(-) diff --git a/src/common/utils.h b/src/common/utils.h index 2b4b821a1835..b919cb301dff 100644 --- a/src/common/utils.h +++ b/src/common/utils.h @@ -842,6 +842,42 @@ inline bool is_float(const int dtype) { return dtype == mshadow::kFloat32 || dtype == mshadow::kFloat64 || dtype == mshadow::kFloat16; } +inline int more_precise_type(const int type1, const int type2) { + if (type1 == type2) return type1; + if (is_float(type1) && is_float(type2)) { + if (type1 == mshadow::kFloat64 || type2 == mshadow::kFloat64) { + return mshadow::kFloat64; + } + if (type1 == mshadow::kFloat32 || type2 == mshadow::kFloat32) { + return mshadow::kFloat32; + } + return mshadow::kFloat16; + } else if (is_float(type1) || is_float(type2)) { + return is_float(type1) ? type1 : type2; + } + if (type1 == mshadow::kInt64 || type2 == mshadow::kInt64) { + return mshadow::kInt64; + } + if (type1 == mshadow::kInt32 || type2 == mshadow::kInt32) { + return mshadow::kInt32; + } + CHECK(!((type1 == mshadow::kUint8 && type2 == mshadow::kInt8) || + (type1 == mshadow::kInt8 && type2 == mshadow::kUint8))) + << "1 is UInt8 and 1 is Int8 should not get here"; + if (type1 == mshadow::kUint8 || type2 == mshadow::kUint8) { + return mshadow::kUint8; + } + return mshadow::kInt8; +} + +inline int np_binary_out_type(const int type1, const int type2) { + if ((type1 == mshadow::kUint8 && type2 == mshadow::kInt8) || + (type1 == mshadow::kInt8 && type2 == mshadow::kUint8)) { + return mshadow::kInt32; + } + return more_precise_type(type1, type2); +} + } // namespace common } // namespace mxnet #endif // MXNET_COMMON_UTILS_H_ diff --git a/src/operator/leaky_relu-inl.h b/src/operator/leaky_relu-inl.h index d73fa1be54a4..3d81cfc0d967 100644 --- a/src/operator/leaky_relu-inl.h +++ b/src/operator/leaky_relu-inl.h @@ -134,8 +134,7 @@ class LeakyReLUOp : public Operator { mshadow::Shape oshape = new_oshape.get(); mshadow::Shape lstride = mxnet_op::calc_stride(new_lshape.get()); mshadow::Shape rstride = mxnet_op::calc_stride(new_rshape.get()); - mxnet_op::Kernel, xpu>:: + mxnet_op::Kernel, xpu>:: template LaunchEx(s, new_oshape.Size(), req[leakyrelu::kOut], lstride, rstride, oshape, in_data[leakyrelu::kData].dptr(), in_data[leakyrelu::kGamma].dptr(), out_data[leakyrelu::kOut].dptr()); diff --git a/src/operator/mshadow_op.h b/src/operator/mshadow_op.h index c5a2b1308c73..1ece97b0efd8 100644 --- a/src/operator/mshadow_op.h +++ b/src/operator/mshadow_op.h @@ -132,6 +132,26 @@ struct true_divide : public mxnet_op::tunable { MSHADOW_XINLINE static float Map(DType a, DType b) { return static_cast(a) / static_cast(b); } + +#ifndef _WIN32 + template::value, int>::type = 0> + MSHADOW_XINLINE static mshadow::half::half_t Map(DType a, mshadow::half::half_t b) { + return static_cast(a) / b; + } + + template::value, 
int>::type = 0> + MSHADOW_XINLINE static float Map(DType a, float b) { + return static_cast(a) / b; + } + + template::value, int>::type = 0> + MSHADOW_XINLINE static double Map(DType a, double b) { + return static_cast(a) / b; + } +#endif }; struct rtrue_divide : public mxnet_op::tunable { @@ -146,6 +166,26 @@ struct rtrue_divide : public mxnet_op::tunable { MSHADOW_XINLINE static float Map(DType a, DType b) { return static_cast(b) / static_cast(a); } + +#ifndef _WIN32 + template::value, int>::type = 0> + MSHADOW_XINLINE static mshadow::half::half_t Map(DType a, mshadow::half::half_t b) { + return b / static_cast(a); + } + + template::value, int>::type = 0> + MSHADOW_XINLINE static float Map(DType a, float b) { + return b / static_cast(a); + } + + template::value, int>::type = 0> + MSHADOW_XINLINE static double Map(DType a, double b) { + return b / static_cast(a); + } +#endif }; MXNET_BINARY_MATH_OP_NC(left, a); diff --git a/src/operator/mxnet_op.h b/src/operator/mxnet_op.h index 91478660a123..5d297a547c8f 100644 --- a/src/operator/mxnet_op.h +++ b/src/operator/mxnet_op.h @@ -471,6 +471,69 @@ struct AccType { {__VA_ARGS__} \ } \ break; \ + case mshadow::kBool: \ + { \ + typedef bool DType; \ + {__VA_ARGS__} \ + } \ + break; \ + default: \ + LOG(FATAL) << "Unknown type enum " << type; \ + } + +#define MXNET_INT32_INT64_TYPE_SWITCH(type, DType, ...)\ + switch (type) { \ + case mshadow::kFloat32: \ + { \ + typedef float DType; \ + LOG(FATAL) << "This operation only support " \ + "integer types, not float32"; \ + } \ + break; \ + case mshadow::kFloat64: \ + { \ + typedef double DType; \ + LOG(FATAL) << "This operation only support " \ + "integer types, not float64"; \ + } \ + break; \ + case mshadow::kFloat16: \ + { \ + typedef mshadow::half::half_t DType; \ + LOG(FATAL) << "This operation only support " \ + "integer types, not float16"; \ + } \ + break; \ + case mshadow::kUint8: \ + { \ + LOG(FATAL) << "This operation only support " \ + "integer types, not uint8"; \ + } \ + break; \ + case mshadow::kInt8: \ + { \ + LOG(FATAL) << "This operation only support " \ + "integer types, not int8"; \ + } \ + break; \ + case mshadow::kInt32: \ + { \ + typedef int32_t DType; \ + {__VA_ARGS__} \ + } \ + break; \ + case mshadow::kInt64: \ + { \ + typedef int64_t DType; \ + {__VA_ARGS__} \ + } \ + break; \ + case mshadow::kBool: \ + { \ + LOG(FATAL) << "This operation only support " \ + "integer types, not bool"; \ + } \ + break; \ default: \ LOG(FATAL) << "Unknown type enum " << type; \ } @@ -783,6 +846,56 @@ struct op_with_req { KERNEL_ASSIGN(out[i], req, OP::Map(in[i], value)); } +#ifndef _WIN32 + /*! \brief inputs are two tensors with a half_t output tensor */ + template::value, int>::type = 0> + MSHADOW_XINLINE static void Map(index_t i, + mshadow::half::half_t *out, + const DType *lhs, + const mshadow::half::half_t *rhs) { + KERNEL_ASSIGN(out[i], req, OP::Map(lhs[i], rhs[i])); + } + + /*! \brief inputs are two tensors with a float output tensor */ + template::value, int>::type = 0> + MSHADOW_XINLINE static void Map(index_t i, float *out, const DType *lhs, const float *rhs) { + KERNEL_ASSIGN(out[i], req, OP::Map(lhs[i], rhs[i])); + } + + /*! \brief inputs are two tensors with a double output tensor */ + template::value, int>::type = 0> + MSHADOW_XINLINE static void Map(index_t i, double *out, const DType *lhs, const double *rhs) { + KERNEL_ASSIGN(out[i], req, OP::Map(lhs[i], rhs[i])); + } + + /*! 
\brief inputs are two tensors with a half_t output tensor */ + template::value, int>::type = 0> + MSHADOW_XINLINE static void Map(index_t i, + mshadow::half::half_t *out, + const DType *lhs, + const mshadow::half::half_t value) { + KERNEL_ASSIGN(out[i], req, OP::Map(lhs[i], value)); + } + + /*! \brief inputs are two tensors with a float output tensor */ + template::value, int>::type = 0> + MSHADOW_XINLINE static void Map(index_t i, float *out, const DType *lhs, const float value) { + KERNEL_ASSIGN(out[i], req, OP::Map(lhs[i], value)); + } + + /*! \brief inputs are two tensors with a double output tensor */ + template::value, int>::type = 0> + MSHADOW_XINLINE static void Map(index_t i, double *out, const DType *lhs, const double value) { + KERNEL_ASSIGN(out[i], req, OP::Map(lhs[i], value)); + } +#endif + /*! \brief inputs are two tensors with a float output tensor */ template::value, int>::type = 0> diff --git a/src/operator/nn/dropout-inl.h b/src/operator/nn/dropout-inl.h index 61239d33800c..1eff5cd8591d 100644 --- a/src/operator/nn/dropout-inl.h +++ b/src/operator/nn/dropout-inl.h @@ -394,8 +394,7 @@ class DropoutOp { mshadow::Shape oshape = new_oshape.get(); mshadow::Shape lstride = mxnet_op::calc_stride(new_lshape.get()); mshadow::Shape rstride = mxnet_op::calc_stride(new_rshape.get()); - mxnet_op::Kernel, xpu>:: + mxnet_op::Kernel, xpu>:: template LaunchEx(s, new_oshape.Size(), req[dropout::kOut], lstride, rstride, oshape, in.dptr(), @@ -463,8 +462,7 @@ class DropoutOp { mshadow::Shape oshape = new_oshape.get(); mshadow::Shape lstride = mxnet_op::calc_stride(new_lshape.get()); mshadow::Shape rstride = mxnet_op::calc_stride(new_rshape.get()); - mxnet_op::Kernel, xpu>:: + mxnet_op::Kernel, xpu>:: template LaunchEx(s, new_oshape.Size(), req[0], lstride, rstride, oshape, grad.dptr(), mask.dptr(), gdata.dptr()); }); diff --git a/src/operator/nn/softmax-inl.h b/src/operator/nn/softmax-inl.h index 601a0526650c..89da570c133b 100644 --- a/src/operator/nn/softmax-inl.h +++ b/src/operator/nn/softmax-inl.h @@ -790,7 +790,7 @@ void SoftmaxCompute(const nnvm::NodeAttrs& attrs, << "Mask needs to be provided when using softmax with use_length=True."; type = inputs[1].type_flag_; } - MXNET_INT_TYPE_SWITCH(type, IType, { + MXNET_INT32_INT64_TYPE_SWITCH(type, IType, { IType* mask_ptr = nullptr; if (param.use_length.value()) { mask_ptr = inputs[1].dptr(); @@ -834,7 +834,7 @@ void SoftmaxGradCompute(const nnvm::NodeAttrs& attrs, const std::vector& outputs) { using namespace mxnet_op; if (softmax_use_length(attrs)) { - MXNET_INT_TYPE_SWITCH(inputs[2].type_flag_, IType, { + MXNET_INT32_INT64_TYPE_SWITCH(inputs[2].type_flag_, IType, { if (req[1] != kNullOp) { mxnet_op::Kernel::Launch( ctx.get_stream(), outputs[1].Size(), outputs[1].dptr()); @@ -856,7 +856,7 @@ void SoftmaxGradCompute(const nnvm::NodeAttrs& attrs, MXNET_REAL_ACC_TYPE_SWITCH(inputs[0].type_flag_, OType, AType, { MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, { MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { - MXNET_INT_TYPE_SWITCH(itype, IType, { + MXNET_INT32_INT64_TYPE_SWITCH(itype, IType, { IType * length_ptr = nullptr; if (softmax_use_length(attrs)) { length_ptr = inputs[2].dptr(); diff --git a/src/operator/numpy/np_true_divide-inl.h b/src/operator/numpy/np_true_divide-inl.h index cc74e19aef8f..0bc60a08803e 100644 --- a/src/operator/numpy/np_true_divide-inl.h +++ b/src/operator/numpy/np_true_divide-inl.h @@ -43,30 +43,42 @@ void TrueDivideScalarCompute(const nnvm::NodeAttrs &attrs, CHECK_EQ(outputs.size(), 1U); if (req[0] == kNullOp || 
outputs[0].Size() == 0U) return; using namespace mshadow; + using namespace mxnet_op; using namespace mshadow::expr; Stream *s = ctx.get_stream(); const double alpha = nnvm::get(attrs.parsed); - if (common::is_float(inputs[0].type_flag_)) { + const TBlob& data = inputs[0]; + const TBlob& out = outputs[0]; + if (out.type_flag_ == data.type_flag_) { MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, { MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { - mxnet_op::Kernel, xpu>::Launch( - s, inputs[0].Size(), outputs[0].dptr(), inputs[0].dptr(), DType(alpha)); + Kernel, xpu>::Launch( + s, data.Size(), out.dptr(), data.dptr(), DType(alpha)); }); }); } else { +#ifndef _WIN32 CHECK_EQ(outputs[0].type_flag_, kFloat32) << "true_divide only supports float32 output " "when input's dtype is " << type_string(inputs[0].type_flag_); MXNET_INT_TYPE_SWITCH(inputs[0].type_flag_, DType, { MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { - mxnet_op::Kernel, xpu>::Launch( - s, inputs[0].Size(), outputs[0].dptr(), inputs[0].dptr(), DType(alpha)); + Kernel, xpu>::Launch( + s, data.Size(), out.dptr(), data.dptr(), + static_cast(alpha)); }); }); +#else + Tensor temp_tensor = + ctx.requested[0].get_space_typed(mshadow::Shape1(data.Size()), s); + TBlob temp_tblob(temp_tensor); + CastCompute(attrs, ctx, {data}, {kWriteTo}, {temp_tblob}); + TrueDivideScalarCompute(attrs, ctx, {temp_tblob}, req, outputs); +#endif } } -template +template void TrueDivideElemwiseCompute(const nnvm::NodeAttrs &attrs, const OpContext &ctx, const std::vector &inputs, @@ -77,66 +89,254 @@ void TrueDivideElemwiseCompute(const nnvm::NodeAttrs &attrs, Stream *s = ctx.get_stream(); CHECK_EQ(inputs.size(), 2U); CHECK_EQ(outputs.size(), 1U); - MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { - if (common::is_float(inputs[0].type_flag_)) { - MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { - Kernel, xpu>::Launch(s, outputs[0].Size(), - outputs[0].dptr(), - inputs[0].dptr(), - inputs[1].dptr()); + + const TBlob& lhs = inputs[0]; + const TBlob& rhs = inputs[1]; + const TBlob& out = outputs[0]; + if (lhs.type_flag_ == rhs.type_flag_) { + // Case when types of the 2 input tensors are the same + if (common::is_float(lhs.type_flag_)) { + // If both are the same floats, normal launch + MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { + MSHADOW_REAL_TYPE_SWITCH(lhs.type_flag_, DType, { + Kernel, xpu>::Launch( + s, out.Size(), out.dptr(), lhs.dptr(), rhs.dptr()); + }); }); } else { - CHECK_EQ(outputs[0].type_flag_, kFloat32) << "true_divide only supports float32 output " - "when input's dtype is " - << type_string(inputs[0].type_flag_); - MXNET_INT_TYPE_SWITCH(inputs[0].type_flag_, DType, { - Kernel, xpu>::Launch(s, outputs[0].Size(), - outputs[0].dptr(), - inputs[0].dptr(), - inputs[1].dptr()); + // If both are the same integers, output is float32 + CHECK_EQ(out.type_flag_, kFloat32) << "true_divide only supports float32 output " + "when input's dtype is " + << type_string(lhs.type_flag_); + MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { + MXNET_INT_TYPE_SWITCH(lhs.type_flag_, DType, { + Kernel, xpu>::Launch( + s, out.Size(), out.dptr(), lhs.dptr(), rhs.dptr()); + }); }); } - }); + } else { +#ifndef _WIN32 + // Non-windows case: no usage of temporary space + // Case when types of the 2 input tensors are different + if (common::is_float(lhs.type_flag_) && common::is_float(rhs.type_flag_)) { + // both lhs and rhs are float types, output type is the more precise one + LOG(ERROR) << "not implemented yet..."; + } else if (common::is_float(lhs.type_flag_) || common::is_float(rhs.type_flag_)) { + // 
one is float type, the other is integer type, the output type should be the same as float + CHECK_EQ(out.type_flag_, + common::is_float(lhs.type_flag_) ? lhs.type_flag_ : rhs.type_flag_) + << "This case out type should be same as the float type"; + if (common::is_float(lhs.type_flag_)) { + // lhs is the float one + MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { + MSHADOW_REAL_TYPE_SWITCH(lhs.type_flag_, LType, { + MXNET_INT_TYPE_SWITCH(rhs.type_flag_, RType, { + Kernel, xpu>::Launch( + s, out.Size(), out.dptr(), rhs.dptr(), lhs.dptr()); + }); + }); + }); + } else { + // rhs is the float one + MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { + MXNET_INT_TYPE_SWITCH(lhs.type_flag_, LType, { + MSHADOW_REAL_TYPE_SWITCH(rhs.type_flag_, RType, { + Kernel, xpu>::Launch( + s, out.Size(), out.dptr(), lhs.dptr(), rhs.dptr()); + }); + }); + }); + } + } else { + // lhs is integer type, rhs is integer type, output type should be float + LOG(ERROR) << "not implemented yet..."; + } +#else + // Windows case: using temp space for casting the type + // Case when types of the 2 input tensors are different + if (common::is_float(lhs.type_flag_) && common::is_float(rhs.type_flag_)) { + // both lhs and rhs are float types, output type is the more precise one + LOG(ERROR) << "not implemented yet..."; + } else if (common::is_float(lhs.type_flag_) || common::is_float(rhs.type_flag_)) { + // lhs is float type, rhs is integer type, the output type should be the same as lhs + CHECK_EQ(out.type_flag_, + common::is_float(lhs.type_flag_) ? lhs.type_flag_ : rhs.type_flag_) + << "This case out type should be same as the float type"; + TBlob temp_tblob; + if (common::is_float(lhs.type_flag_)) { + // lhs is the float one + MSHADOW_REAL_TYPE_SWITCH(lhs.type_flag_, LType, { + Tensor temp_tensor = + ctx.requested[0].get_space_typed(mshadow::Shape1(rhs.Size()), s); + temp_tblob = TBlob(temp_tensor); + }); + CastCompute(attrs, ctx, {rhs}, {kWriteTo}, {temp_tblob}); + TrueDivideElemwiseCompute( + attrs, ctx, {lhs, temp_tblob.reshape(rhs.shape_)}, req, outputs); + } else { + // rhs is the float one + MSHADOW_REAL_TYPE_SWITCH(rhs.type_flag_, RType, { + Tensor temp_tensor = + ctx.requested[0].get_space_typed(mshadow::Shape1(lhs.Size()), s); + temp_tblob = TBlob(temp_tensor); + }); + CastCompute(attrs, ctx, {lhs}, {kWriteTo}, {temp_tblob}); + TrueDivideElemwiseCompute( + attrs, ctx, {temp_tblob.reshape(lhs.shape_), rhs}, req, outputs); + } + } else { + // lhs is integer type, rhs is integer type, output type should be float + LOG(ERROR) << "not implemented yet..."; + } +#endif + } } -template +template void TrueDivideBroadcastCompute(const nnvm::NodeAttrs& attrs, const OpContext& ctx, const std::vector& inputs, const std::vector& req, const std::vector& outputs) { + using namespace mxnet_op; if (outputs[0].shape_.Size() == 0U) return; + CHECK_EQ(inputs.size(), 2U); mxnet::TShape new_lshape, new_rshape, new_oshape; int ndim = BinaryBroadcastShapeCompact(inputs[0].shape_, inputs[1].shape_, outputs[0].shape_, &new_lshape, &new_rshape, &new_oshape); if (!ndim) { - TrueDivideElemwiseCompute(attrs, ctx, inputs, req, outputs); + TrueDivideElemwiseCompute(attrs, ctx, inputs, req, outputs); } else { if (req[0] == kNullOp) return; mshadow::Stream *s = ctx.get_stream(); + const TBlob& lhs = inputs[0]; + const TBlob& rhs = inputs[1]; + const TBlob& out = outputs[0]; +#ifndef _WIN32 BROADCAST_NDIM_SWITCH(ndim, NDim, { mshadow::Shape oshape = new_oshape.get(); - mshadow::Shape lstride = mxnet_op::calc_stride(new_lshape.get()); - mshadow::Shape rstride = 
mxnet_op::calc_stride(new_rshape.get()); - if (common::is_float(inputs[0].type_flag_)) { - MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { - mxnet_op::Kernel, xpu>:: - template LaunchEx(s, new_oshape.Size(), req[0], lstride, rstride, oshape, - inputs[0].dptr(), inputs[1].dptr(), - outputs[0].dptr()); - }); - } else { - CHECK_EQ(outputs[0].type_flag_, mshadow::kFloat32) + mshadow::Shape lstride = calc_stride(new_lshape.get()); + mshadow::Shape rstride = calc_stride(new_rshape.get()); + if (lhs.type_flag_ == rhs.type_flag_) { + // When the both inputs have the same data types + if (common::is_float(lhs.type_flag_)) { + // If both inputs are the same float types, output is the same float type + MSHADOW_REAL_TYPE_SWITCH(lhs.type_flag_, DType, { + Kernel, xpu>:: + template LaunchEx(s, new_oshape.Size(), req[0], lstride, rstride, oshape, + lhs.dptr(), rhs.dptr(), out.dptr()); + }); + } else { + CHECK_EQ(out.type_flag_, mshadow::kFloat32) << "true_divide only supports float32 output when input's dtype is " - << type_string(inputs[0].type_flag_); - MXNET_INT_TYPE_SWITCH(inputs[0].type_flag_, DType, { - mxnet_op::Kernel, xpu>:: - template LaunchEx(s, new_oshape.Size(), req[0], lstride, rstride, oshape, - inputs[0].dptr(), inputs[1].dptr(), - outputs[0].dptr()); - }); + << type_string(lhs.type_flag_); + MXNET_INT_TYPE_SWITCH(lhs.type_flag_, DType, { + // If both inputs are the same integer types, output is float type + Kernel, xpu>:: + template LaunchEx(s, new_oshape.Size(), req[0], lstride, rstride, oshape, + lhs.dptr(), rhs.dptr(), out.dptr()); + }); + } + } else { + if (common::is_float(lhs.type_flag_) && common::is_float(rhs.type_flag_)) { + // lhs and rhs have different float types, the output is the more precise one + LOG(ERROR) << "not implemented yet..."; + } else if (common::is_float(lhs.type_flag_) || common::is_float(rhs.type_flag_)) { + // one of lhs and rhs is float, the output is the same type as the float one + if (common::is_float(lhs.type_flag_)) { + // lhs is float type, output will be the same float type + CHECK_EQ(lhs.type_flag_, out.type_flag_) + << "lhs should have the same type as out, infer type broken?"; + MSHADOW_REAL_TYPE_SWITCH(lhs.type_flag_, LType, { + MXNET_INT_TYPE_SWITCH(rhs.type_flag_, RType, { + Kernel, xpu>:: + template LaunchEx(s, new_oshape.Size(), req[0], rstride, lstride, oshape, + rhs.dptr(), lhs.dptr(), out.dptr()); + }); + }); + } else { + // rhs is float type, output will be the same float type + CHECK_EQ(rhs.type_flag_, out.type_flag_) + << "rhs should have the same type as out, infer type broken?"; + MXNET_INT_TYPE_SWITCH(lhs.type_flag_, LType, { + MSHADOW_REAL_TYPE_SWITCH(rhs.type_flag_, RType, { + Kernel, xpu>:: + template LaunchEx(s, new_oshape.Size(), req[0], lstride, rstride, oshape, + lhs.dptr(), rhs.dptr(), out.dptr()); + }); + }); + } + } else { + // lhs and rhs have different integer types, the output is float type + LOG(ERROR) << "not implemented yet..."; + } } }); +#else + if (lhs.type_flag_ == rhs.type_flag_) { + BROADCAST_NDIM_SWITCH(ndim, NDim, { + mshadow::Shape oshape = new_oshape.get(); + mshadow::Shape lstride = calc_stride(new_lshape.get()); + mshadow::Shape rstride = calc_stride(new_rshape.get()); + // When the both inputs have the same data types + if (common::is_float(lhs.type_flag_)) { + // If both inputs are the same float types, output is the same float type + MSHADOW_REAL_TYPE_SWITCH(lhs.type_flag_, DType, { + Kernel, xpu>:: + template LaunchEx(s, new_oshape.Size(), req[0], lstride, rstride, oshape, + lhs.dptr(), 
rhs.dptr(), out.dptr()); + }); + } else { + CHECK_EQ(out.type_flag_, mshadow::kFloat32) + << "true_divide only supports float32 output when input's dtype is " + << type_string(lhs.type_flag_); + MXNET_INT_TYPE_SWITCH(lhs.type_flag_, DType, { + // If both inputs are the same integer types, output is float type + Kernel, xpu>:: + template LaunchEx(s, new_oshape.Size(), req[0], lstride, rstride, oshape, + lhs.dptr(), rhs.dptr(), out.dptr()); + }); + } + }); + } else { + if (common::is_float(lhs.type_flag_) && common::is_float(rhs.type_flag_)) { + // lhs and rhs have different float types, the output is the more precise one + LOG(ERROR) << "not implemented yet..."; + } else if (common::is_float(lhs.type_flag_) || common::is_float(rhs.type_flag_)) { + // one of lhs and rhs is float, the output is the same type as the float one + TBlob temp_tblob; + if (common::is_float(lhs.type_flag_)) { + // lhs is float type, output will be the same float type + CHECK_EQ(lhs.type_flag_, out.type_flag_) + << "lhs should have the same type as out, infer type broken?"; + MSHADOW_REAL_TYPE_SWITCH(lhs.type_flag_, LType, { + Tensor temp_tensor = + ctx.requested[0].get_space_typed(mshadow::Shape1(rhs.Size()), s); + temp_tblob = TBlob(temp_tensor); + }); + CastCompute(attrs, ctx, {rhs}, {kWriteTo}, {temp_tblob}); + TrueDivideBroadcastCompute( + attrs, ctx, {lhs, temp_tblob.reshape(rhs.shape_)}, req, outputs); + } else { + // rhs is float type, output will be the same float type + CHECK_EQ(rhs.type_flag_, out.type_flag_) + << "rhs should have the same type as out, infer type broken?"; + MSHADOW_REAL_TYPE_SWITCH(rhs.type_flag_, RType, { + Tensor temp_tensor = + ctx.requested[0].get_space_typed(mshadow::Shape1(lhs.Size()), s); + temp_tblob = TBlob(temp_tensor); + }); + CastCompute(attrs, ctx, {lhs}, {kWriteTo}, {temp_tblob}); + TrueDivideBroadcastCompute( + attrs, ctx, {temp_tblob.reshape(lhs.shape_), rhs}, req, outputs); + } + } else { + // lhs and rhs have different integer types, the output is float type + LOG(ERROR) << "not implemented yet..."; + } + } +#endif } } diff --git a/src/operator/numpy/np_true_divide.cc b/src/operator/numpy/np_true_divide.cc index 5a4634c3ff8c..d2135befef42 100644 --- a/src/operator/numpy/np_true_divide.cc +++ b/src/operator/numpy/np_true_divide.cc @@ -28,26 +28,35 @@ namespace mxnet { namespace op { +int TrueDivideOutType(int ltype, int rtype) { + if (common::is_float(ltype) && common::is_float(rtype)) { + // If both inputs are float, return the one with the higher precision + return common::more_precise_type(ltype, rtype); + } else if (common::is_float(ltype) || common::is_float(rtype)) { + // If only one of the inputs is float, return that float type + return (common::is_float(ltype)) ? 
ltype : rtype; + } + // If neither of the inputs is float, return the default float32 type + return mshadow::kFloat32; +} + template bool TrueDivideType(const nnvm::NodeAttrs& attrs, std::vector* in_attrs, std::vector* out_attrs) { CHECK_EQ(in_attrs->size(), static_cast(num_inputs)); + CHECK_GT(in_attrs->size(), 0U); CHECK_EQ(out_attrs->size(), 1U); + for (const int dtype : *in_attrs) { if (dtype == -1) return false; } - if (num_inputs == 2) { - const int lhs_dtype = in_attrs->at(0); - const int rhs_dtype = in_attrs->at(1); - CHECK_EQ(lhs_dtype, rhs_dtype) - << "true_divide currently only supports same dtype for dividend and divisor"; - } - if (common::is_float(in_attrs->at(0))) { - TYPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(0)); - } else { - TYPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::kFloat32); - } + + const int lhs_dtype = in_attrs->at(0); + const int rhs_dtype = (num_inputs == 2) ? + in_attrs->at(1) : + (common::is_float(lhs_dtype) ? lhs_dtype : mshadow::kFloat32); + TYPE_ASSIGN_CHECK(*out_attrs, 0, TrueDivideOutType(lhs_dtype, rhs_dtype)); return true; } @@ -64,7 +73,13 @@ NNVM_REGISTER_OP(_npi_true_divide) [](const NodeAttrs& attrs){ return std::vector >{{0, 0}, {1, 0}}; }) -.set_attr("FCompute", TrueDivideBroadcastCompute) +#ifdef _WIN32 +.set_attr("FResourceRequest", + [](const NodeAttrs& attrs) { + return std::vector{ResourceRequest::kTempSpace}; + }) +#endif +.set_attr("FCompute", TrueDivideBroadcastCompute) .set_attr("FGradient", ElemwiseGradUseIn{"_backward_broadcast_div"}) .add_argument("lhs", "NDArray-or-Symbol", "Dividend array") .add_argument("rhs", "NDArray-or-Symbol", "Divisor array"); @@ -81,6 +96,12 @@ NNVM_REGISTER_OP(_npi_true_divide_scalar) [](const NodeAttrs& attrs) { return std::vector >{{0, 0}}; }) +#ifdef _WIN32 +.set_attr("FResourceRequest", + [](const NodeAttrs& attrs) { + return std::vector{ResourceRequest::kTempSpace}; + }) +#endif .set_attr("FCompute", TrueDivideScalarCompute) .set_attr("FGradient", ElemwiseGradUseNone{"_backward_div_scalar"}) .add_argument("data", "NDArray-or-Symbol", "source input") @@ -98,6 +119,12 @@ NNVM_REGISTER_OP(_npi_rtrue_divide_scalar) [](const NodeAttrs& attrs) { return std::vector >{{0, 0}}; }) +#ifdef _WIN32 +.set_attr("FResourceRequest", + [](const NodeAttrs& attrs) { + return std::vector{ResourceRequest::kTempSpace}; + }) +#endif .set_attr("FCompute", TrueDivideScalarCompute) .set_attr("FGradient", ElemwiseGradUseNone{"_backward_rdiv_scalar"}) .add_argument("data", "NDArray-or-Symbol", "source input") diff --git a/src/operator/numpy/np_true_divide.cu b/src/operator/numpy/np_true_divide.cu index c026d689233d..7211f4a0a006 100644 --- a/src/operator/numpy/np_true_divide.cu +++ b/src/operator/numpy/np_true_divide.cu @@ -29,7 +29,7 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_npi_true_divide) -.set_attr("FCompute", TrueDivideBroadcastCompute); +.set_attr("FCompute", TrueDivideBroadcastCompute); NNVM_REGISTER_OP(_npi_true_divide_scalar) .set_attr("FCompute", TrueDivideScalarCompute); diff --git a/src/operator/tensor/elemwise_binary_broadcast_op.h b/src/operator/tensor/elemwise_binary_broadcast_op.h index 3d3bcfacbd05..ad06df8d92be 100644 --- a/src/operator/tensor/elemwise_binary_broadcast_op.h +++ b/src/operator/tensor/elemwise_binary_broadcast_op.h @@ -187,9 +187,10 @@ inline int BinaryBroadcastShapeCompact(const mxnet::TShape& lshape, const mxnet: } namespace mxnet_op { -template +template struct binary_broadcast_kernel { /*! 
\brief Map function for binary_broadcast_kernel */ + template MSHADOW_XINLINE static void Map(index_t base, index_t length, OpReqType req, const Shape &lstride, const Shape &rstride, const Shape &oshape, IType *lhs, IType *rhs, @@ -208,6 +209,7 @@ struct binary_broadcast_kernel { } /*! \brief Map function for binary_broadcast_kernel */ + template MSHADOW_XINLINE static void Map(index_t base, index_t length, OpReqType req, const Shape &lstride, const Shape &rstride, const Shape &oshape, IType lhs, IType *rhs, @@ -224,6 +226,49 @@ struct binary_broadcast_kernel { KERNEL_ASSIGN(out[base + i], req, OP::Map(lhs, rhs[ridx])); } } + +#ifndef _WIN32 + /*! \brief Map function for binary_broadcast_kernel */ + template::value, int>::type = 0> + MSHADOW_XINLINE static void Map(index_t base, index_t length, OpReqType req, + const Shape &lstride, const Shape &rstride, + const Shape &oshape, IType *lhs, DType *rhs, + DType *out) { + Shape coord = unravel(base, oshape); + auto lidx = static_cast(dot(coord, lstride)); + auto ridx = static_cast(dot(coord, rstride)); + KERNEL_ASSIGN(out[base], req, OP::Map(lhs[lidx], rhs[ridx])); + // starts from 1 to avoid extra inc at end of loop + for (index_t i = 1; i < length; ++i) { + inc(&coord, oshape, &lidx, lstride, &ridx, rstride); + // When tuning, don't actually run the op, since it's not going to be tuned against + // the actual op we'll eventually be using + KERNEL_ASSIGN(out[base + i], req, OP::Map(lhs[lidx], rhs[ridx])); + } + } + + /*! \brief Map function for binary_broadcast_kernel */ + template::value && + !std::is_pointer::value, int>::type = 0> + MSHADOW_XINLINE static void Map(index_t base, index_t length, OpReqType req, + const Shape &lstride, const Shape &rstride, + const Shape &oshape, IType lhs, DType *rhs, + DType *out) { + Shape coord = unravel(base, oshape); + auto lidx = static_cast(dot(coord, lstride)); + auto ridx = static_cast(dot(coord, rstride)); + KERNEL_ASSIGN(out[base], req, OP::Map(lhs, rhs[ridx])); + // starts from 1 to avoid extra inc at end of loop + for (index_t i = 1; i < length; ++i) { + inc(&coord, oshape, &lidx, lstride, &ridx, rstride); + // When tuning, don't actually run the op, since it's not going to be tuned against + // the actual op we'll eventually be using + KERNEL_ASSIGN(out[base + i], req, OP::Map(lhs, rhs[ridx])); + } + } +#endif }; template @@ -307,7 +352,7 @@ void BinaryBroadcastCompute(const nnvm::NodeAttrs& attrs, mshadow::Shape oshape = new_oshape.get(); mshadow::Shape lstride = mxnet_op::calc_stride(new_lshape.get()); mshadow::Shape rstride = mxnet_op::calc_stride(new_rshape.get()); - mxnet_op::Kernel, xpu>:: + mxnet_op::Kernel, xpu>:: template LaunchEx(s, new_oshape.Size(), req[0], lstride, rstride, oshape, inputs[0].dptr(), inputs[1].dptr(), outputs[0].dptr()); }); @@ -336,7 +381,7 @@ void BinaryBroadcastComputeLogic(const nnvm::NodeAttrs& attrs, mshadow::Shape oshape = new_oshape.get(); mshadow::Shape lstride = mxnet_op::calc_stride(new_lshape.get()); mshadow::Shape rstride = mxnet_op::calc_stride(new_rshape.get()); - mxnet_op::Kernel, xpu>:: + mxnet_op::Kernel, xpu>:: template LaunchEx(s, new_oshape.Size(), req[0], lstride, rstride, oshape, inputs[0].dptr(), inputs[1].dptr(), outputs[0].dptr()); @@ -444,11 +489,11 @@ void BinaryBroadcastCsrDnsDnsImpl(const OpContext& ctx, Shape lstride = calc_stride(new_csrshape.get()); Shape rstride = calc_stride(new_dnsshape.get()); if (reverse && std::is_same::value) { - Kernel, xpu>:: + Kernel, xpu>:: template LaunchEx(s, new_oshape.Size(), req, lstride, rstride, 
oshape, DType(0), dns_data.dptr(), out_data.dptr()); } else { - Kernel, xpu>:: + Kernel, xpu>:: template LaunchEx(s, new_oshape.Size(), req, lstride, rstride, oshape, DType(0), dns_data.dptr(), out_data.dptr()); } @@ -658,7 +703,7 @@ void BinaryBroadcastBackwardUseIn(const nnvm::NodeAttrs& attrs, [](const NodeAttrs& attrs) { \ return std::vector{"lhs", "rhs"}; \ }) \ - .set_attr("FInferShape", BinaryBroadcastShape) \ + .set_attr("FInferShape", BinaryBroadcastShape) \ .set_attr("FInferType", ElemwiseType<2, 1>) \ .set_attr("FInplaceOption", \ [](const NodeAttrs& attrs){ \ diff --git a/src/operator/tensor/elemwise_binary_scalar_op.h b/src/operator/tensor/elemwise_binary_scalar_op.h index 02b005eed995..834bbdbfc3d1 100644 --- a/src/operator/tensor/elemwise_binary_scalar_op.h +++ b/src/operator/tensor/elemwise_binary_scalar_op.h @@ -256,7 +256,7 @@ class BinaryScalarOp : public UnaryOp { using namespace mshadow::expr; Stream *s = ctx.get_stream(); const double alpha = nnvm::get(attrs.parsed); - MSHADOW_TYPE_SWITCH(inputs[0].type_flag_, DType, { + MSHADOW_TYPE_SWITCH_WITH_BOOL(inputs[0].type_flag_, DType, { MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { mxnet_op::Kernel, xpu>::Launch( s, inputs[0].Size(), outputs[0].dptr(), inputs[0].dptr(), DType(alpha)); diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py index a2716fb5363f..d04b97b64342 100644 --- a/tests/python/unittest/test_numpy_op.py +++ b/tests/python/unittest/test_numpy_op.py @@ -1940,7 +1940,7 @@ def get_new_shape(shape, axis): with mx.autograd.record(): y = test_concat(a, b, c, d) - + assert y.shape == expected_ret.shape assert_almost_equal(y.asnumpy(), expected_ret, rtol=1e-3, atol=1e-5) @@ -2871,6 +2871,118 @@ def check_svd(UT, L, V, data_np): check_svd(UT, L, V, data_np) +@with_seed() +@use_np +def test_np_linalg_cholesky(): + class TestCholesky(HybridBlock): + def __init__(self): + super(TestCholesky, self).__init__() + + def hybrid_forward(self, F, data): + return F.np.linalg.cholesky(data) + + def get_grad(L): + # shape of m is [batch, n, n] + if 0 in L.shape: + return L + + def copyltu(m): + eye = _np.array([_np.eye(m.shape[-1]) for i in range(m.shape[0])]) + lower = _np.tril(m) - eye * m + lower_mask = _np.tril(_np.ones_like(m)) + ret = lower_mask * m + lower.swapaxes(-1, -2) + return ret + + shape = L.shape + L = L.reshape(-1, shape[-2], shape[-1]) + dL = _np.ones_like(L) + L_inv = _np.linalg.inv(L) + L_inv_T = L_inv.swapaxes(-1, -2) + L_T = L.swapaxes(-1, -2) + sym_L_inv = 0.5 * (L_inv + L_inv_T) + dA = 0.5 * _np.matmul(_np.matmul(L_inv_T, copyltu(_np.matmul(L_T, dL))), L_inv) + return dA.reshape(shape) + + def check_cholesky(L, data_np): + assert L.shape == data_np.shape + # catch error if numpy throws rank < 2 + try: + L_expected = _np.linalg.cholesky(data_np) + except Exception as e: + print(data_np) + print(data_np.shape) + print(e) + else: + assert L.shape == L_expected.shape + assert_almost_equal(L.asnumpy(), L_expected, rtol=rtol, atol=atol) + + shapes = [ + (0, 0), + (1, 1), + (5, 5), + (6, 6), + (6, 6, 6), + (1, 0, 0), + (0, 1, 1), + (2, 3, 4, 4), + ] + dtypes = ['float32', 'float64'] + for hybridize, dtype, shape in itertools.product([True, False], dtypes, shapes): + atol = rtol = 1e-2 + + test_cholesky = TestCholesky() + if hybridize: + test_cholesky.hybridize() + + # Numerical issue: + # When backpropagating through Cholesky decomposition, we need to compute the inverse + # of L according to dA = 0.5 * L**(-T) * copyLTU(L**T * dL) * L**(-1) where A = LL^T. 
+ # The inverse is calculated by "trsm" method in CBLAS. When the data type is float32, + # this causes numerical instability. It happens when the matrix is ill-conditioned. + # In this example, the issue occurs frequently if the symmetric positive definite input + # matrix A is constructed by A = LL^T + \epsilon * I. A proper way of testing such + # operators involving numerically unstable operations is to use well-conditioned random + # matrices as input. Here we test Cholesky decomposition for FP32 and FP64 separately. + # See rocBLAS: + # https://github.com/ROCmSoftwarePlatform/rocBLAS/wiki/9.Numerical-Stability-in-TRSM + + # generate symmetric PD matrices + if 0 in shape: + data_np = np.ones(shape) + else: + data_np_l = _np.random.uniform(-10., 10., shape) + if dtype == 'float32': + data_np_l_flat = data_np_l.reshape((-1, shape[-2], shape[-1])) + else: + data_np_l_flat = _np.tril(data_np_l.reshape((-1, shape[-2], shape[-1]))) + for i in range(data_np_l_flat.shape[0]): + for j in range(data_np_l_flat.shape[-1]): + if data_np_l_flat[i, j, j] < 0: + data_np_l_flat[i, j, j] = -data_np_l_flat[i, j, j] + elif data_np_l_flat[i, j, j] == 0: + data_np_l_flat[i, j, j] = 2 + data_np = _np.matmul(data_np_l_flat, data_np_l_flat.swapaxes(-1, -2)) + data_np = data_np.reshape(shape) + + # When dtype is np.FP32, truncation from FP64 to FP32 could also be a source of + # instability since the ground-truth gradient is computed using FP64 data. + data = np.array(data_np, dtype=dtype) + data.attach_grad() + with mx.autograd.record(): + L = test_cholesky(data) + + # check cholesky validity + check_cholesky(L, data_np) + # check backward. backward does not support empty input + if 0 not in L.shape: + mx.autograd.backward(L) + backward_expected = get_grad(L.asnumpy()) + assert_almost_equal(data.grad.asnumpy(), backward_expected, rtol=rtol, atol=atol) + # check imperative once again + L = np.linalg.cholesky(data) + check_cholesky(L, data_np) + + @with_seed() @use_np def test_np_vstack(): @@ -3735,12 +3847,14 @@ def test_np_true_divide(): [(2, 3, 1), (1, 4)], [(2, 1, 4, 1), (3, 1, 5)], ] - dtypes = [np.int8, np.uint8, np.int32, np.int64, np.float16, np.float32, np.float64] + dtypes = [np.bool, np.int8, np.uint8, np.int32, np.int64, np.float16, np.float32, np.float64] + itypes = [np.bool, np.int8, np.uint8, np.int32, np.int64] + ftypes = [np.float16, np.float32, np.float64] for shape_pair, dtype in itertools.product(shapes, dtypes): a = np.random.uniform(3, 50, size=shape_pair[0]).astype(dtype) b = np.random.uniform(3, 50, size=shape_pair[-1]).astype(dtype) out_mx = a / b - if _np.issubdtype(dtype, _np.integer): + if _np.issubdtype(dtype, _np.integer) or (dtype is np.bool): assert out_mx.dtype == np.float32 else: assert out_mx.dtype == dtype @@ -3756,6 +3870,20 @@ def test_np_true_divide(): out_np = _np.true_divide(val, a.asnumpy()) assert_almost_equal(out_mx.asnumpy(), out_np, rtol=1e-3, atol=1e-3, use_broadcast=False) + for shape_pair, itype, ftype in itertools.product(shapes, itypes, ftypes): + i_ = np.random.uniform(3, 50, size=shape_pair[0]).astype(itype) + f_ = np.random.uniform(3, 50, size=shape_pair[-1]).astype(ftype) + + out_mx = i_ / f_ + assert out_mx.dtype == ftype + out_np = _np.true_divide(i_.asnumpy(), f_.asnumpy()) + assert_almost_equal(out_mx.asnumpy(), out_np, rtol=1e-3, atol=1e-3, use_broadcast=False) + + out_mx = f_ / i_ + assert out_mx.dtype == ftype + out_np = _np.true_divide(f_.asnumpy(), i_.asnumpy()) + assert_almost_equal(out_mx.asnumpy(), out_np, rtol=1e-3, atol=1e-3, 
use_broadcast=False) + @with_seed() @use_np From f93f69c25323a2ea17b37fd53ceaa2652207af9e Mon Sep 17 00:00:00 2001 From: Wuxun Zhang Date: Fri, 8 Nov 2019 09:30:02 +0800 Subject: [PATCH 2/4] [MKLDNN] use dim_t instead of int in slice/transpose operators (#16737) * use dim_t instead of int * fix same issue in pooling * rebase code * trigger CI --- src/operator/nn/mkldnn/mkldnn_pooling.cc | 2 +- src/operator/nn/mkldnn/mkldnn_slice.cc | 2 +- src/operator/nn/mkldnn/mkldnn_transpose.cc | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/operator/nn/mkldnn/mkldnn_pooling.cc b/src/operator/nn/mkldnn/mkldnn_pooling.cc index f9dbe5bbfd8f..6eda2aa33b34 100644 --- a/src/operator/nn/mkldnn/mkldnn_pooling.cc +++ b/src/operator/nn/mkldnn/mkldnn_pooling.cc @@ -127,7 +127,7 @@ mkldnn::algorithm GetMKLDNNPoolAlgo(const PoolingParam ¶m) { } } -static inline int GetPaddingSizeFull(int x, int padl, int padr, int k, int s) { +static inline int GetPaddingSizeFull(dim_t x, int padl, int padr, int k, int s) { if ((x + padl + padr - k) % s != 0) { return (padr + s - ((x + padl + padr - k) % s)); } else { diff --git a/src/operator/nn/mkldnn/mkldnn_slice.cc b/src/operator/nn/mkldnn/mkldnn_slice.cc index dba10f8b6cd5..575554a25c88 100644 --- a/src/operator/nn/mkldnn/mkldnn_slice.cc +++ b/src/operator/nn/mkldnn/mkldnn_slice.cc @@ -41,7 +41,7 @@ MKLDNNSliceFwd::MKLDNNSliceFwd(const SliceParam ¶m, mkldnn::memory::dims dims(N); mkldnn::memory::dims offsets(N); for (int i = 0; i < N; ++i) { - int s = 0; + dim_t s = 0; if (i < param.begin.ndim() && param.begin[i]) { s = *param.begin[i]; if (s < 0) s += ishape[i]; diff --git a/src/operator/nn/mkldnn/mkldnn_transpose.cc b/src/operator/nn/mkldnn/mkldnn_transpose.cc index 2ec38d586552..ee9c06d49744 100644 --- a/src/operator/nn/mkldnn/mkldnn_transpose.cc +++ b/src/operator/nn/mkldnn/mkldnn_transpose.cc @@ -73,7 +73,7 @@ class MKLDNNTransposeForward { mkldnn_dims_t strides; mkldnn_dims_t sh; - unsigned int total_stride = 1; + dim_t total_stride = 1; for (int i = data_ndim - 1; i >= 0; i--) { sh[i] = shape[i]; strides[axes[i]] = total_stride; From 0af550a2a1fd9611a6f50b7a6c462b084ddfdd5a Mon Sep 17 00:00:00 2001 From: Caenorst Date: Wed, 6 Nov 2019 21:00:43 -0500 Subject: [PATCH 3/4] Add MXNet Ops for fast multihead attention (#16408) * add MXNet Ops for fast multihead attention * add cutlass as 3rdparty dependency * add cutlass to compilation flags * remove all cutlass stuff * add better error message and description and remove cutlass from compilation flags * change credit for the approach since the code have changed * fix typos * correct another typo * Add all the cuda/cublas helper functions * remove tests using kAddTo * only use cublasStridedBatchedGemm if CUDA >= 9.1 * add equivalent mxnet code in description of mha ops * remove a wrong copy-paste * add _contrib for namespace and add GPU only on description * add warning in bwd_ignore_zero_init description, also test with fp32 * add error return if bwd_ignore_zero_init is used without MXNET_EXEC_ENABLE_ADDTO * remove std::move for clang * remove bwd_ignore_zero_init flag * remove bwd_ignore_zero_init in test_operator_gpu.py * fix typo * fix another typo --- src/common/cuda_utils.h | 74 ++++ src/operator/contrib/transformer-inl.h | 9 + src/operator/contrib/transformer.cc | 270 ++++++++++++ src/operator/contrib/transformer.cu | 560 +++++++++++++++++++++++++ tests/python/gpu/test_operator_gpu.py | 316 +++++++++++++- 5 files changed, 1228 insertions(+), 1 deletion(-) diff --git a/src/common/cuda_utils.h 
b/src/common/cuda_utils.h index 2f7254040475..ccf0931f2480 100644 --- a/src/common/cuda_utils.h +++ b/src/common/cuda_utils.h @@ -187,6 +187,69 @@ namespace mxnet { namespace common { /*! \brief common utils for cuda */ namespace cuda { +/*! + * \brief Converts between C++ datatypes and enums/constants needed by cuBLAS. + */ +template +struct CublasType; + +// With CUDA v8, cuBLAS adopted use of cudaDataType_t instead of its own +// datatype cublasDataType_t. The older cudaDataType_t values could be +// included below, but since this class was introduced to support the cuBLAS v8 +// call cublasGemmEx(), burdening the class with the legacy type values +// was not needed. + +template<> +struct CublasType { + static const int kFlag = mshadow::kFloat32; +#if CUDA_VERSION >= 8000 + static const cudaDataType_t kCudaFlag = CUDA_R_32F; +#endif + typedef float ScaleType; + static const float one; + static const float zero; +}; +template<> +struct CublasType { + static const int kFlag = mshadow::kFloat64; +#if CUDA_VERSION >= 8000 + static const cudaDataType_t kCudaFlag = CUDA_R_64F; +#endif + typedef double ScaleType; + static const double one; + static const double zero; +}; +template<> +struct CublasType { + static const int kFlag = mshadow::kFloat16; +#if CUDA_VERSION >= 8000 + static const cudaDataType_t kCudaFlag = CUDA_R_16F; +#endif + typedef float ScaleType; + static const mshadow::half::half_t one; + static const mshadow::half::half_t zero; +}; +template<> +struct CublasType { + static const int kFlag = mshadow::kUint8; +#if CUDA_VERSION >= 8000 + static const cudaDataType_t kCudaFlag = CUDA_R_8I; +#endif + typedef uint8_t ScaleType; + static const uint8_t one = 1; + static const uint8_t zero = 0; +}; +template<> +struct CublasType { + static const int kFlag = mshadow::kInt32; +#if CUDA_VERSION >= 8000 + static const cudaDataType_t kCudaFlag = CUDA_R_32I; +#endif + typedef int32_t ScaleType; + static const int32_t one = 1; + static const int32_t zero = 0; +}; + /*! * \brief Get string representation of cuBLAS errors. * \param error The error. @@ -218,6 +281,17 @@ inline const char* CublasGetErrorString(cublasStatus_t error) { return "Unknown cuBLAS status"; } +#if CUDA_VERSION >= 8000 +/*! + * \brief Create the proper constant for indicating cuBLAS transposition, if desired. + * \param transpose Whether transposition should be performed. + * \return the yes/no transposition-indicating constant. + */ +inline cublasOperation_t CublasTransposeOp(bool transpose) { + return transpose ? CUBLAS_OP_T : CUBLAS_OP_N; +} +#endif + /*! * \brief Get string representation of cuSOLVER errors. * \param error The error. 
diff --git a/src/operator/contrib/transformer-inl.h b/src/operator/contrib/transformer-inl.h index da3d14e33cf4..da48ffa52dca 100644 --- a/src/operator/contrib/transformer-inl.h +++ b/src/operator/contrib/transformer-inl.h @@ -34,6 +34,15 @@ namespace mxnet { namespace op { +struct InterleavedMatMulParam : public dmlc::Parameter { + int heads; + bool bwd_ignore_zero_init; + DMLC_DECLARE_PARAMETER(InterleavedMatMulParam) { + DMLC_DECLARE_FIELD(heads) + .describe("Set number of heads"); + } +}; + template static void DivSqrtDimForward_(const nnvm::NodeAttrs& attrs, const OpContext& ctx, diff --git a/src/operator/contrib/transformer.cc b/src/operator/contrib/transformer.cc index 00085c0dc7aa..2ca6f8c71093 100644 --- a/src/operator/contrib/transformer.cc +++ b/src/operator/contrib/transformer.cc @@ -29,6 +29,276 @@ namespace mxnet { namespace op { +DMLC_REGISTER_PARAMETER(InterleavedMatMulParam); + +static bool InterleavedMatMulSelfAttQKShape(const NodeAttrs& attrs, + mxnet::ShapeVector* in_shape, + mxnet::ShapeVector* out_shape) { + const auto& params = nnvm::get(attrs.parsed); + CHECK_EQ(in_shape->size(), 1U) << "Input:[queries_keys_values] currently have, " + << in_shape->size() << " inputs"; + auto qkv_shape = in_shape->at(0); + CHECK_EQ(qkv_shape.ndim(), 3U) + << "Input queries_keys_values should be 3D in seq_length-batch-proj_dim, " + << "currently is: " << qkv_shape.ndim() << "D"; + out_shape->resize(1); + SHAPE_ASSIGN_CHECK(*out_shape, 0, + mxnet::TShape({params.heads * qkv_shape[1], qkv_shape[0], qkv_shape[0]})); + return true; +} + +static bool InterleavedMatMulSelfAttValAttShape(const NodeAttrs& attrs, + mxnet::ShapeVector* in_shape, + mxnet::ShapeVector* out_shape) { + CHECK_EQ(in_shape->size(), 2U) << "Input:[queries_keys_values, attention] currently have, " + << in_shape->size() << " inputs"; + auto qkv_shape = in_shape->at(0); + auto att_shape = in_shape->at(1); + CHECK_EQ(qkv_shape.ndim(), 3U) + << "Input queries_keys_values should be 3D in seq_length-batch-3*proj_dim, " + << "currently is: " << qkv_shape.ndim() << "D"; + CHECK_EQ(att_shape.ndim(), 3U) + << "Input attention should be 3D in batch-seq_length-seq_length, " + << "currently is: " << att_shape.ndim() << "D"; + CHECK_EQ(qkv_shape[0], att_shape[1]) + << "queries_keys_values.shape[0] and attention.shape[1] should be the same, " + << "currently are " << qkv_shape[0] << " and " << att_shape[1]; + CHECK_EQ(qkv_shape[0], att_shape[2]) + << "queries_keys_values.shape[0] and attention.shape[2] should be the same, " + << "currently are " << qkv_shape[0] << " and " << att_shape[2]; + CHECK_EQ(qkv_shape[2] % 3, 0) + << "queries_keys_values.shape[2] should be a multiple of 3, " + << "currently is " << qkv_shape[2]; + SHAPE_ASSIGN_CHECK(*out_shape, 0, + mxnet::TShape({qkv_shape[0], qkv_shape[1], qkv_shape[2] / 3})); + return true; +} + +static bool InterleavedMatMulEncDecQKShape(const NodeAttrs& attrs, + mxnet::ShapeVector* in_shape, + mxnet::ShapeVector* out_shape) { + const auto& params = nnvm::get(attrs.parsed); + CHECK_EQ(in_shape->size(), 2U) << "Input:[queries, keys_values], currently have " + << in_shape->size() << " inputs"; + auto q_shape = in_shape->at(0); + auto kv_shape = in_shape->at(1); + CHECK_EQ(q_shape.ndim(), 3U) << "Input queries should be 3D in seq_length-batch-proj_dim, " + << "currently is " << q_shape.ndim() << "D"; + CHECK_EQ(kv_shape.ndim(), 3U) << "Input queries should be 3D in seq_length-batch-2*proj_dim, " + << "currently is " << kv_shape.ndim() << "D"; + CHECK_EQ(q_shape[2] * 2, kv_shape[2]) + << 
"keys_values.shape[2] should be equal to queries.shape[2] * 2, " + << "currently are: " << kv_shape[2] << " and " << q_shape[2]; + CHECK_EQ(q_shape[1], kv_shape[1]) + << "queries.shape[1] should be equal to keys_values.shape[1], " + << "currently are: " << q_shape[1] << " and " << kv_shape[1]; + SHAPE_ASSIGN_CHECK(*out_shape, 0, + mxnet::TShape({q_shape[1] * params.heads, q_shape[0], kv_shape[0]})); + return true; +} + +static bool InterleavedMatMulEncDecValAttShape(const NodeAttrs& attrs, + mxnet::ShapeVector* in_shape, + mxnet::ShapeVector* out_shape) { + const auto& params = nnvm::get(attrs.parsed); + CHECK_EQ(in_shape->size(), 2U) << "Input: [keys_values, attention], currently have " + << in_shape->size() << " inputs"; + auto kv_shape = in_shape->at(0); + auto att_shape = in_shape->at(1); + CHECK_EQ(kv_shape.ndim(), 3U) + << "Input keys_values should be 3D in seq_length-batch-2*proj_dim, " + << "currently is " << kv_shape.ndim() << "D"; + CHECK_EQ(att_shape.ndim(), 3U) + << "Input attention should be 3D in batch-seq_length-seq_length, " + << "currently is " << att_shape.ndim() << "D"; + CHECK_EQ(kv_shape[0], att_shape[2]) + << "keys_values.shape[0] should be equal to attention.shape[2], currently are " + << kv_shape[0] << " and " << att_shape[2]; + CHECK_EQ(kv_shape[1] * params.heads, att_shape[0]) << "attention.shape[0] " + << "should be equal to keys_values.shape[1] * heads, currently are: " + << att_shape[2] << " and " << kv_shape[1]; + SHAPE_ASSIGN_CHECK(*out_shape, 0, + mxnet::TShape({att_shape[1], kv_shape[1], kv_shape[2] / 2})); + return true; +} + +NNVM_REGISTER_OP(_contrib_interleaved_matmul_selfatt_qk) +.describe(R"code(Compute the matrix multiplication between the projections of +queries and keys in multihead attention use as self attention. + +the input must be a single tensor of interleaved projections +of queries, keys and values following the layout: +(seq_length, batch_size, num_heads * head_dim * 3) + +the equivalent code would be: +tmp = mx.nd.reshape(queries_keys_values, shape=(0, 0, num_heads, 3, -1)) +q_proj = mx.nd.transpose(tmp[:,:,:,0,:], axes=(1, 2, 0, 3)) +q_proj = mx.nd.reshape(q_proj, shape=(-1, 0, 0), reverse=True) +q_proj = mx.nd.contrib.div_sqrt_dim(q_proj) +k_proj = mx.nd.transpose(tmp[:,:,:,1,:], axes=(1, 2, 0, 3)) +k_proj = mx.nd.reshap(k_proj, shape=(-1, 0, 0), reverse=True) +output = mx.nd.batch_dot(q_proj, k_proj, transpose_b=True) + +This Op is GPU only +)code" ADD_FILELINE) +.set_num_inputs(1) +.set_num_outputs(1) +.set_attr_parser(ParamParser) +.set_attr("FListInputNames", [](const NodeAttrs& attrs) { + return std::vector{"queries_keys_values"}; +}) +.set_attr("FListOutputNames", [](const NodeAttrs& attrs) { + return std::vector{"output"}; +}) +.set_attr("FInferShape", InterleavedMatMulSelfAttQKShape) +.set_attr("FInferType", ElemwiseType<1, 1>) +.set_attr("FGradient", + ElemwiseGradUseIn{"_backward_interleaved_matmul_selfatt_qk"}) +.add_argument("queries_keys_values", "NDArray-or-Symbol", "Interleaved queries, keys and values") +.add_arguments(InterleavedMatMulParam::__FIELDS__()); + +NNVM_REGISTER_OP(_backward_interleaved_matmul_selfatt_qk) +.set_num_inputs(2) +.set_num_outputs(1) +.set_attr("TIsBackward", true) +.set_attr_parser(ParamParser); + +NNVM_REGISTER_OP(_contrib_interleaved_matmul_selfatt_valatt) +.describe(R"code(Compute the matrix multiplication between the projections of +values and the attention weights in multihead attention use as self attention. 
+ +the inputs must be a tensor of interleaved projections +of queries, keys and values following the layout: +(seq_length, batch_size, num_heads * head_dim * 3) + +and the attention weights following the layout: +(batch_size, seq_length, seq_length) + +the equivalent code would be: +tmp = mx.nd.reshape(queries_keys_values, shape=(0, 0, num_heads, 3, -1)) +v_proj = mx.nd.transpose(tmp[:,:,:,2,:], axes=(1, 2, 0, 3)) +v_proj = mx.nd.reshape(v_proj, shape=(-1, 0, 0), reverse=True) +output = mx.nd.batch_dot(attention, v_proj, transpose_b=True) +output = mx.nd.reshape(output, shape=(-1, num_heads, 0, 0), reverse=True) +output = mx.nd.transpose(output, axes=(0, 2, 1, 3)) +output = mx.nd.reshape(output, shape=(0, 0, -1)) + +This Op is GPU only +)code" ADD_FILELINE) +.set_num_inputs(2) +.set_num_outputs(1) +.set_attr_parser(ParamParser) +.set_attr("FListInputNames", [](const NodeAttrs& attrs) { + return std::vector{"queries_keys_values", "attention"}; +}) +.set_attr("FListOutputNames", [](const NodeAttrs& attrs) { + return std::vector{"output"}; +}) +.set_attr("FInferShape", InterleavedMatMulSelfAttValAttShape) +.set_attr("FInferType", ElemwiseType<2, 1>) +.set_attr("FGradient", + ElemwiseGradUseIn{"_backward_interleaved_matmul_selfatt_valatt"}) +.add_argument("queries_keys_values", "NDArray-or-Symbol", "Queries, keys and values interleaved") +.add_argument("attention", "NDArray-or-Symbol", "Attention maps") +.add_arguments(InterleavedMatMulParam::__FIELDS__()); + +NNVM_REGISTER_OP(_backward_interleaved_matmul_selfatt_valatt) +.set_num_inputs(3) +.set_num_outputs(2) +.set_attr("TIsBackward", true) +.set_attr_parser(ParamParser); + +NNVM_REGISTER_OP(_contrib_interleaved_matmul_encdec_qk) +.describe(R"code(Compute the matrix multiplication between the projections of +queries and keys in multihead attention use as encoder-decoder. 
+ +the inputs must be a tensor of projections of queries following the layout: +(seq_length, batch_size, num_heads * head_dim) + +and a tensor of interleaved projections of values and keys following the layout: +(seq_length, batch_size, num_heads * head_dim * 2) + +the equivalent code would be: +q_proj = mx.nd.transpose(queries, axes=(1, 2, 0, 3)) +q_proj = mx.nd.reshape(q_proj, shape=(-1, 0, 0), reverse=True) +q_proj = mx.nd.contrib.div_sqrt_dim(q_proj) +tmp = mx.nd.reshape(keys_values, shape=(0, 0, num_heads, 2, -1)) +k_proj = mx.nd.transpose(tmp[:,:,:,0,:], axes=(1, 2, 0, 3)) +k_proj = mx.nd.reshap(k_proj, shape=(-1, 0, 0), reverse=True) +output = mx.nd.batch_dot(q_proj, k_proj, transpose_b=True) + +This Op is GPU only +)code" ADD_FILELINE) +.set_num_inputs(2) +.set_num_outputs(1) +.set_attr_parser(ParamParser) +.set_attr("FListInputNames", [](const NodeAttrs& attrs) { + return std::vector{"queries", "keys_values"}; +}) +.set_attr("FListOutputNames", [](const NodeAttrs& attrs) { + return std::vector{"output"}; +}) +.set_attr("FInferShape", InterleavedMatMulEncDecQKShape) +.set_attr("FInferType", ElemwiseType<2, 1>) +.set_attr("FGradient", + ElemwiseGradUseIn{"_backward_interleaved_matmul_encdec_qk"}) +.add_argument("queries", "NDArray-or-Symbol", "Queries") +.add_argument("keys_values", "NDArray-or-Symbol", "Keys and values interleaved") +.add_arguments(InterleavedMatMulParam::__FIELDS__()); + +NNVM_REGISTER_OP(_backward_interleaved_matmul_encdec_qk) +.set_num_inputs(3) +.set_num_outputs(2) +.set_attr("TIsBackward", true) +.set_attr_parser(ParamParser); + +NNVM_REGISTER_OP(_contrib_interleaved_matmul_encdec_valatt) +.describe(R"code(Compute the matrix multiplication between the projections of +values and the attention weights in multihead attention use as encoder-decoder. 
+ +the inputs must be a tensor of interleaved projections of +keys and values following the layout: +(seq_length, batch_size, num_heads * head_dim * 2) + +and the attention weights following the layout: +(batch_size, seq_length, seq_length) + +the equivalent code would be: + +tmp = mx.nd.reshape(queries_keys_values, shape=(0, 0, num_heads, 3, -1)) +v_proj = mx.nd.transpose(tmp[:,:,:,1,:], axes=(1, 2, 0, 3)) +v_proj = mx.nd.reshape(v_proj, shape=(-1, 0, 0), reverse=True) +output = mx.nd.batch_dot(attention, v_proj, transpose_b=True) +output = mx.nd.reshape(output, shape=(-1, num_heads, 0, 0), reverse=True) +output = mx.nd.transpose(output, axes=(0, 2, 1, 3)) +output = mx.nd.reshape(output, shape=(0, 0, -1)) + +This Op is GPU only +)code" ADD_FILELINE) +.set_num_inputs(2) +.set_num_outputs(1) +.set_attr_parser(ParamParser) +.set_attr("FListInputNames", [](const NodeAttrs& attrs) { + return std::vector{"keys_values", "attention"}; +}) +.set_attr("FListOutputNames", [](const NodeAttrs& attrs) { + return std::vector{"output"}; +}) +.set_attr("FInferShape", InterleavedMatMulEncDecValAttShape) +.set_attr("FInferType", ElemwiseType<2, 1>) +.set_attr("FGradient", + ElemwiseGradUseIn{"_backward_interleaved_matmul_encdec_valatt"}) +.add_argument("keys_values", "NDArray-or-Symbol", "Keys and values interleaved") +.add_argument("attention", "NDArray-or-Symbol", "Attention maps") +.add_arguments(InterleavedMatMulParam::__FIELDS__()); + +NNVM_REGISTER_OP(_backward_interleaved_matmul_encdec_valatt) +.set_num_inputs(3) +.set_num_outputs(2) +.set_attr("TIsBackward", true) +.set_attr_parser(ParamParser); + + // relu MXNET_OPERATOR_REGISTER_UNARY(_contrib_div_sqrt_dim) .describe(R"code(Rescale the input by the square root of the channel dimension. diff --git a/src/operator/contrib/transformer.cu b/src/operator/contrib/transformer.cu index 6ed073db6011..e152669478dd 100644 --- a/src/operator/contrib/transformer.cu +++ b/src/operator/contrib/transformer.cu @@ -22,12 +22,572 @@ * \file transformer.cu * \brief GPU implementation of the operators used in Transformer */ + +#include +#include +#include +#include + #include #include "./transformer-inl.h" +#include "../../common/cuda_utils.h" namespace mxnet { namespace op { +// Approach in gemm_switch_fp32accum is coming from MLPerf v0.6 submission repository from NVIDIA +// by https://github.com/kevinstephano +template +void CublasStridedBatchedGemm(mshadow::Stream* s, bool transA, bool transB, + int32_t m, int32_t n, int32_t k, + float alpha, const DType* a, int32_t lda, int32_t strideA, + const DType *b, int32_t ldb, int32_t strideB, float beta, + DType *c, int32_t ldc, int32_t strideC, int32_t batchCount, + cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP) { +#if CUDA_VERSION >= 9010 + using namespace mxnet::common::cuda; + CHECK_EQ(s->blas_handle_ownership_, mshadow::Stream::OwnHandle) + << "Must init CuBLAS handle in stream"; + + cublasHandle_t blas_handle = mshadow::Stream::GetBlasHandle(s); + auto err = CUBLAS_STATUS_SUCCESS; + // TODO(cfujitsang): handle computation_precision + err = cublasGemmStridedBatchedEx( + blas_handle, CublasTransposeOp(transA), CublasTransposeOp(transB), + static_cast(m), static_cast(n), static_cast(k), + reinterpret_cast(&alpha), + a, CublasType::kCudaFlag, static_cast(lda), strideA, + b, CublasType::kCudaFlag, static_cast(ldb), strideB, + reinterpret_cast(&beta), + c, CublasType::kCudaFlag, static_cast(ldc), strideC, + static_cast(batchCount), CUDA_R_32F, algo); + CHECK_EQ(err, CUBLAS_STATUS_SUCCESS) << "Cublas gemmEx fail."; 
+#else + LOG(FATAL) << "Not implemented with CUDA < 9.1"; +#endif +} + +template +void gemm_switch_fp32accum(mshadow::Stream* s, bool transA, bool transB, + int32_t m, int32_t n, int32_t k, + float alpha, const DType *a, int32_t lda, + int32_t strideA, const DType *b, int32_t ldb, + int32_t strideB, float beta, DType *c, int32_t ldc, + int32_t strideC, int32_t batchCount) { + cudaStream_t stream = mshadow::Stream::GetStream(s); + if (!(lda & 0x7) && !(ldb & 0x7) && !(ldc & 0x7)) { + CublasStridedBatchedGemm(s, transA, transB, m, n, k, alpha, a, lda, strideA, b, ldb, + strideB, beta, c, ldc, strideC, batchCount, CUBLAS_GEMM_ALGO0_TENSOR_OP); + } else { + CublasStridedBatchedGemm(s, transA, transB, m, n, k, alpha, a, lda, strideA, b, ldb, + strideB, beta, c, ldc, strideC, batchCount); + } + CHECK_CUDA_ERROR("Error at InterleavedMatMul"); +} + +// TODO(cfujitsang): use scale as optional ? +void InterleavedMatMulSelfAttQKGPU(const nnvm::NodeAttrs& attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + const auto& params = nnvm::get(attrs.parsed); + mshadow::Stream* s = ctx.get_stream(); + MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { + const DType* queries_keys_values = inputs[0].FlatTo2D(s).dptr_; + DType* output = outputs[0].FlatTo2D(s).dptr_; + const int32_t qkv_seq_len = inputs[0].shape_[0]; + const int32_t sequences = inputs[0].shape_[1]; + const int32_t output_lin_dim = inputs[0].shape_[2]; + const int32_t embed_dim = output_lin_dim / 3; + const int32_t head_dim = embed_dim / params.heads; + const int32_t attn_batches = params.heads * sequences; + const int32_t lead_dim = attn_batches * 3 * head_dim; + const int32_t batch_stride = 3 * head_dim; + const float beta = req[0] == kAddTo ? 1.f : 0.f; + const float scale = 1.0 / sqrt(static_cast(head_dim)); + + if (req[0] == kNullOp) + return; + + gemm_switch_fp32accum(s, + true, + false, + qkv_seq_len, + qkv_seq_len, + head_dim, + scale, + queries_keys_values + head_dim, + lead_dim, + batch_stride, + queries_keys_values, + lead_dim, + batch_stride, + beta, + output, + qkv_seq_len, + qkv_seq_len * qkv_seq_len, + attn_batches); + }) +} + +void BackwardInterleavedMatMulSelfAttQKGPU(const nnvm::NodeAttrs& attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + const auto& params = nnvm::get(attrs.parsed); + mshadow::Stream* s = ctx.get_stream(); + MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { + const DType* output_grads = inputs[0].FlatTo2D(s).dptr_; + const DType* queries_keys_values = inputs[1].FlatTo2D(s).dptr_; + DType* queries_keys_values_grads = outputs[0].FlatTo2D(s).dptr_; + const int32_t qkv_seq_len = inputs[1].shape_[0]; + const int32_t sequences = inputs[1].shape_[1]; + const int32_t output_lin_dim = inputs[1].shape_[2]; + const int32_t embed_dim = output_lin_dim / 3; + const int32_t head_dim = embed_dim / params.heads; + const int32_t attn_batches = params.heads * sequences; + const int32_t lead_dim = attn_batches * 3 * head_dim; + const int32_t batch_stride = 3 * head_dim; + const float scale = 1.0 / sqrt(static_cast(head_dim)); + const float beta = req[0] == kAddTo ? 
1.f : 0.f; + + if (req[0] == kNullOp) + return; + + if (req[0] == kWriteTo) { + cudaMemsetAsync(queries_keys_values_grads, 0, outputs[0].shape_.Size() * sizeof(DType), + mshadow::Stream::GetStream(s)); + } + + gemm_switch_fp32accum(s, + false, + false, + head_dim, + qkv_seq_len, + qkv_seq_len, + scale, + queries_keys_values + head_dim, + lead_dim, + batch_stride, + output_grads, + qkv_seq_len, + qkv_seq_len * qkv_seq_len, + beta, + queries_keys_values_grads, + lead_dim, + batch_stride, + attn_batches); + gemm_switch_fp32accum(s, + false, + true, + head_dim, + qkv_seq_len, + qkv_seq_len, + scale, + queries_keys_values, + lead_dim, + batch_stride, + output_grads, + qkv_seq_len, + qkv_seq_len * qkv_seq_len, + beta, + queries_keys_values_grads + head_dim, + lead_dim, + batch_stride, + attn_batches); + }) +} + +void InterleavedMatMulSelfAttValAttGPU(const nnvm::NodeAttrs& attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + const auto& params = nnvm::get(attrs.parsed); + mshadow::Stream* s = ctx.get_stream(); + MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { + const DType* queries_keys_values = inputs[0].FlatTo2D(s).dptr_; + const DType* attention_maps = inputs[1].FlatTo2D(s).dptr_; + DType* output = outputs[0].FlatTo2D(s).dptr_; + const int32_t qkv_seq_len = inputs[0].shape_[0]; + const int32_t sequences = inputs[0].shape_[1]; + const int32_t output_lin_dim = inputs[0].shape_[2]; + const int32_t embed_dim = output_lin_dim / 3; + const int32_t head_dim = embed_dim / params.heads; + const int32_t attn_batches = params.heads * sequences; + const int32_t lead_dim = attn_batches * 3 * head_dim; + const int32_t batch_stride = 3 * head_dim; + const float alpha = 1.f; + const float beta = req[0] == kAddTo ? 1.f : 0.f; + + if (req[0] == kNullOp) + return; + + gemm_switch_fp32accum(s, + false, + false, + head_dim, + qkv_seq_len, + qkv_seq_len, + alpha, + queries_keys_values + 2 * head_dim, + lead_dim, + batch_stride, + attention_maps, + qkv_seq_len, + qkv_seq_len * qkv_seq_len, + beta, + output, + head_dim * attn_batches, + head_dim, + attn_batches); + }) +} + +void BackwardInterleavedMatMulSelfAttValAttGPU(const nnvm::NodeAttrs& attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + const auto& params = nnvm::get(attrs.parsed); + mshadow::Stream* s = ctx.get_stream(); + MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { + const DType* output_grads = inputs[0].FlatTo2D(s).dptr_; + const DType* queries_keys_values = inputs[1].FlatTo2D(s).dptr_; + const DType* attention_maps = inputs[2].FlatTo2D(s).dptr_; + DType* queries_keys_values_grads = outputs[0].FlatTo2D(s).dptr_; + DType* attention_maps_grads = outputs[1].FlatTo2D(s).dptr_; + const int32_t qkv_seq_len = inputs[1].shape_[0]; + const int32_t sequences = inputs[1].shape_[1]; + const int32_t output_lin_dim = inputs[1].shape_[2]; + const int32_t embed_dim = output_lin_dim / 3; + const int32_t head_dim = embed_dim / params.heads; + const int32_t attn_batches = params.heads * sequences; + const int32_t lead_dim = attn_batches * 3 * head_dim; + const int32_t batch_stride = 3 * head_dim; + const float alpha = 1.f; + if (req[0] != kNullOp) { + if (req[0] == kWriteTo) { + cudaMemsetAsync(queries_keys_values_grads, 0, outputs[0].shape_.Size() * sizeof(DType), + mshadow::Stream::GetStream(s)); + } + const float beta = req[0] == kAddTo ? 
1.f : 0.f; + gemm_switch_fp32accum(s, + false, + true, + head_dim, + qkv_seq_len, + qkv_seq_len, + alpha, + output_grads, + head_dim * attn_batches, + head_dim, + attention_maps, + qkv_seq_len, + qkv_seq_len * qkv_seq_len, + beta, + queries_keys_values_grads + 2 * head_dim, + lead_dim, + batch_stride, + attn_batches); + } + if (req[1] != kNullOp) { + const float beta = req[1] == kAddTo ? 1.f : 0.f; + gemm_switch_fp32accum(s, + true, + false, + qkv_seq_len, + qkv_seq_len, + head_dim, + alpha, + queries_keys_values + 2 * head_dim, + lead_dim, + batch_stride, + output_grads, + head_dim * attn_batches, + head_dim, + beta, + attention_maps_grads, + qkv_seq_len, + qkv_seq_len * qkv_seq_len, + attn_batches); + } + }) +} + + +void InterleavedMatMulEncDecQKGPU(const nnvm::NodeAttrs& attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + const auto& params = nnvm::get(attrs.parsed); + mshadow::Stream* s = ctx.get_stream(); + MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { + const DType* queries = inputs[0].FlatTo2D(s).dptr_; + const DType* keys_values = inputs[1].FlatTo2D(s).dptr_; + DType* output = outputs[0].FlatTo2D(s).dptr_; + const int32_t q_seq_len = inputs[0].shape_[0]; + const int32_t sequences = inputs[0].shape_[1]; + const int32_t output_lin_q_dim = inputs[0].shape_[2]; + const int32_t kv_seq_len = inputs[1].shape_[0]; + const int32_t output_lin_kv_dim = inputs[1].shape_[2]; + const int32_t embed_dim = output_lin_q_dim; + const int32_t head_dim = embed_dim / params.heads; + const int32_t attn_batches = params.heads * sequences; + const int32_t lead_dim_q = attn_batches * head_dim; + const int32_t lead_dim_kv = attn_batches * 2 * head_dim; + const int32_t batch_stride_q = head_dim; + const int32_t batch_stride_kv = head_dim * 2; + const float beta = req[0] == kAddTo ? 
1.f : 0.f; + const float scale = 1.f / sqrt(static_cast(head_dim)); + + if (req[0] == kNullOp) + return; + + gemm_switch_fp32accum(s, + true, + false, + kv_seq_len, + q_seq_len, + head_dim, + scale, + keys_values, + lead_dim_kv, + batch_stride_kv, + queries, + lead_dim_q, + batch_stride_q, + beta, + output, + kv_seq_len, + kv_seq_len * q_seq_len, + attn_batches); + }) +} + +void BackwardInterleavedMatMulEncDecQKGPU(const nnvm::NodeAttrs& attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + const auto& params = nnvm::get(attrs.parsed); + mshadow::Stream* s = ctx.get_stream(); + MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { + const DType* output_grads = inputs[0].FlatTo2D(s).dptr_; + const DType* queries = inputs[1].FlatTo2D(s).dptr_; + const DType* keys_values = inputs[2].FlatTo2D(s).dptr_; + DType* queries_grads = outputs[0].FlatTo2D(s).dptr_; + DType* keys_values_grads = outputs[1].FlatTo2D(s).dptr_; + const int32_t q_seq_len = inputs[1].shape_[0]; + const int32_t sequences = inputs[1].shape_[1]; + const int32_t output_lin_q_dim = inputs[1].shape_[2]; + const int32_t kv_seq_len = inputs[2].shape_[0]; + const int32_t output_lin_kv_dim = inputs[2].shape_[2]; + const int32_t embed_dim = output_lin_q_dim; + const int32_t head_dim = embed_dim / params.heads; + const int32_t attn_batches = params.heads * sequences; + const int32_t lead_dim_q = attn_batches * head_dim; + const int32_t lead_dim_kv = attn_batches * 2 * head_dim; + const int32_t batch_stride_q = head_dim; + const int32_t batch_stride_kv = head_dim * 2; + const float scale = 1.f / sqrt(static_cast(head_dim)); + + if (req[0] != kNullOp) { + const float beta = req[0] == kAddTo ? 1.f : 0.f; + gemm_switch_fp32accum(s, + false, + false, + head_dim, + q_seq_len, + kv_seq_len, + scale, + keys_values, + lead_dim_kv, + batch_stride_kv, + output_grads, + kv_seq_len, + kv_seq_len * q_seq_len, + beta, + queries_grads, + lead_dim_q, + batch_stride_q, + attn_batches); + } + if (req[1] != kNullOp) { + if (req[1] == kWriteTo) { + cudaMemsetAsync(keys_values_grads, 0, outputs[1].shape_.Size() * sizeof(DType), + mshadow::Stream::GetStream(s)); + } + const float beta = req[1] == kAddTo ? 
1.f : 0.f; + gemm_switch_fp32accum(s, + false, + true, + head_dim, + kv_seq_len, + q_seq_len, + scale, + queries, + lead_dim_q, + batch_stride_q, + output_grads, + kv_seq_len, + kv_seq_len * q_seq_len, + beta, + keys_values_grads, + lead_dim_kv, + batch_stride_kv, + attn_batches); + } + }) +} + +void InterleavedMatMulEncDecValAttGPU(const nnvm::NodeAttrs& attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + const auto& params = nnvm::get(attrs.parsed); + mshadow::Stream* s = ctx.get_stream(); + MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { + const DType* keys_values = inputs[0].FlatTo2D(s).dptr_; + const DType* attention_maps = inputs[1].FlatTo2D(s).dptr_; + DType* output = outputs[0].FlatTo2D(s).dptr_; + const int32_t kv_seq_len = inputs[0].shape_[0]; + const int32_t sequences = inputs[0].shape_[1]; + const int32_t output_lin_kv_dim = inputs[0].shape_[2]; + const int32_t attn_batches = inputs[1].shape_[0]; + const int32_t q_seq_len = inputs[1].shape_[1]; + const int32_t embed_dim = output_lin_kv_dim / 2; + int32_t head_dim = embed_dim / params.heads; + const int32_t lead_dim_kv = attn_batches * head_dim * 2; + const int32_t batch_stride_kv = 2 * head_dim; + const float alpha = 1.f; + const float beta = req[0] == kAddTo ? 1.f : 0.f; + + if (req[0] == kNullOp) + return; + + gemm_switch_fp32accum(s, + false, + false, + head_dim, + q_seq_len, + kv_seq_len, + alpha, + keys_values + head_dim, + lead_dim_kv, + batch_stride_kv, + attention_maps, + kv_seq_len, + kv_seq_len * q_seq_len, + beta, + output, + head_dim * attn_batches, + head_dim, + attn_batches); + }) +} + +void BackwardInterleavedMatMulEncDecValAttGPU(const nnvm::NodeAttrs& attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + const auto& params = nnvm::get(attrs.parsed); + mshadow::Stream* s = ctx.get_stream(); + MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { + const DType* output_grads = inputs[0].FlatTo2D(s).dptr_; + const DType* keys_values = inputs[1].FlatTo2D(s).dptr_; + const DType* attention_maps = inputs[2].FlatTo2D(s).dptr_; + DType* keys_values_grads = outputs[0].FlatTo2D(s).dptr_; + DType* attention_maps_grads = outputs[1].FlatTo2D(s).dptr_; + const int32_t kv_seq_len = inputs[1].shape_[0]; + const int32_t sequences = inputs[1].shape_[1]; + const int32_t output_lin_kv_dim = inputs[1].shape_[2]; + const int32_t attn_batches = inputs[2].shape_[0]; + const int32_t q_seq_len = inputs[2].shape_[1]; + const int32_t embed_dim = output_lin_kv_dim / 2; + int32_t head_dim = embed_dim / params.heads; + const int32_t lead_dim_kv = attn_batches * head_dim * 2; + const int32_t batch_stride_kv = 2 * head_dim; + const float alpha = 1.f; + + if (req[0] != kNullOp) { + if (req[0] == kWriteTo) { + cudaMemsetAsync(keys_values_grads, 0, outputs[0].shape_.Size() * sizeof(DType), + mshadow::Stream::GetStream(s)); + } + const float beta = req[0] == kAddTo ? 1.f : 0.f; + gemm_switch_fp32accum(s, + false, + true, + head_dim, + kv_seq_len, + q_seq_len, + alpha, + output_grads, + head_dim * attn_batches, + head_dim, + attention_maps, + kv_seq_len, + kv_seq_len * q_seq_len, + beta, + keys_values_grads + head_dim, + lead_dim_kv, + batch_stride_kv, + attn_batches); + } + if (req[1] != kNullOp) { + const float beta = req[1] == kAddTo ? 
1.f : 0.f; + gemm_switch_fp32accum(s, + true, + false, + kv_seq_len, + q_seq_len, + head_dim, + alpha, + keys_values + head_dim, + lead_dim_kv, + batch_stride_kv, + output_grads, + head_dim * attn_batches, + head_dim, + beta, + attention_maps_grads, + kv_seq_len, + kv_seq_len * q_seq_len, + attn_batches); + } + }) +} + +NNVM_REGISTER_OP(_contrib_interleaved_matmul_selfatt_qk) +.set_attr("FCompute", InterleavedMatMulSelfAttQKGPU); + +NNVM_REGISTER_OP(_contrib_interleaved_matmul_selfatt_valatt) +.set_attr("FCompute", InterleavedMatMulSelfAttValAttGPU); + +NNVM_REGISTER_OP(_contrib_interleaved_matmul_encdec_qk) +.set_attr("FCompute", InterleavedMatMulEncDecQKGPU); + +NNVM_REGISTER_OP(_contrib_interleaved_matmul_encdec_valatt) +.set_attr("FCompute", InterleavedMatMulEncDecValAttGPU); + +NNVM_REGISTER_OP(_backward_interleaved_matmul_selfatt_qk) +.set_attr("FCompute", BackwardInterleavedMatMulSelfAttQKGPU); + +NNVM_REGISTER_OP(_backward_interleaved_matmul_selfatt_valatt) +.set_attr("FCompute", BackwardInterleavedMatMulSelfAttValAttGPU); + +NNVM_REGISTER_OP(_backward_interleaved_matmul_encdec_qk) +.set_attr("FCompute", BackwardInterleavedMatMulEncDecQKGPU); + +NNVM_REGISTER_OP(_backward_interleaved_matmul_encdec_valatt) +.set_attr("FCompute", BackwardInterleavedMatMulEncDecValAttGPU); + // relu NNVM_REGISTER_OP(_contrib_div_sqrt_dim) .set_attr("FCompute", DivSqrtDimForward_); diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py index 8b6928a2aa39..fe74eed727e5 100644 --- a/tests/python/gpu/test_operator_gpu.py +++ b/tests/python/gpu/test_operator_gpu.py @@ -2502,13 +2502,327 @@ def test_arange_like_dtype(): x = mx.sym.Variable('x', dtype=t) y = mx.sym.reshape(x, shape=(0, 0, -1)) z = mx.sym.contrib.arange_like(y, axis=-1) - + mod = z.simple_bind(ctx=mx.gpu(0), x=(3, 4, 5, 6), grad_req='null') mod.arg_arrays[0][:] = np.random.normal(size=mod.arg_arrays[0].shape).astype(t) out = mod.forward(is_train=False) for v in out: assert v.dtype == t +@with_seed() +def check_multihead_attention_selfatt(dtype): + def convert_weight(F, q_weight, k_weight, v_weight, num_heads): + q_weight = F.reshape(q_weight, shape=(num_heads, -1, 0), reverse=True) + k_weight = F.reshape(k_weight, shape=(num_heads, -1, 0), reverse=True) + v_weight = F.reshape(v_weight, shape=(num_heads, -1, 0), reverse=True) + all_weights = F.concat(q_weight, k_weight, v_weight, dim=-2) + all_weights = F.reshape(all_weights, shape=(-1, 0), reverse=True) + return all_weights + + def convert_bias(F, q_bias, k_bias, v_bias, num_heads): + q_bias = F.reshape(q_bias, shape=(num_heads, -1)) + k_bias = F.reshape(k_bias, shape=(num_heads, -1)) + v_bias = F.reshape(v_bias, shape=(num_heads, -1)) + all_bias = F.stack(q_bias, k_bias, v_bias, axis=1) + all_bias = F.reshape(all_bias, shape=(-1,)) + return all_bias + + batch_size = 2 + qkv_length = 7 # length of a sequence + qkv_dim = 9 # dimension of encoding + num_heads = 3 # number of attention head + head_dim = 5 # head size + out_dim = 13 * num_heads + qkv_units = num_heads * head_dim + + arg_params = { + 'qkv': mx.nd.array(np.random.rand(*(batch_size, qkv_length, qkv_dim)).astype(dtype) * 0.1, dtype=dtype), + 'q_weight': mx.nd.array(np.random.rand(*(qkv_units, qkv_dim)).astype(dtype) * 0.1, dtype=dtype), + 'k_weight': mx.nd.array(np.random.rand(*(qkv_units, qkv_dim)).astype(dtype) * 0.1, dtype=dtype), + 'v_weight': mx.nd.array(np.random.rand(*(qkv_units, qkv_dim)).astype(dtype) * 0.1, dtype=dtype), + 'q_bias': mx.nd.array(np.random.rand(*(qkv_units,)).astype(dtype) 
* 0.1, dtype=dtype), + 'k_bias': mx.nd.array(np.random.rand(*(qkv_units,)).astype(dtype) * 0.1, dtype=dtype), + 'v_bias': mx.nd.array(np.random.rand(*(qkv_units,)).astype(dtype) * 0.1, dtype=dtype), + 'out_weight': mx.nd.array(np.random.rand(*(out_dim, qkv_units)).astype(dtype) * 0.1, dtype=dtype), + 'out_bias': mx.nd.array(np.random.rand(*(out_dim,)).astype(dtype) * 0.1, dtype=dtype), + } + + qkv = mx.sym.Variable('qkv') + sonde = mx.sym.Variable('sonde') + q_weight = mx.sym.Variable('q_weight') + k_weight = mx.sym.Variable('k_weight') + v_weight = mx.sym.Variable('v_weight') + q_bias = mx.sym.Variable('q_bias') + k_bias = mx.sym.Variable('k_bias') + v_bias = mx.sym.Variable('v_bias') + out_weight = mx.sym.Variable('out_weight') + out_bias = mx.sym.Variable('out_bias') + qkv_weight = convert_weight(mx.sym, q_weight, k_weight, v_weight, num_heads) + qkv_bias = convert_bias(mx.sym, q_bias, k_bias, v_bias, num_heads) + qkv = mx.sym.transpose(qkv, axes=(1, 0, 2)) + qkv_proj = mx.sym.FullyConnected(qkv, weight=qkv_weight, bias=qkv_bias, flatten=False, + num_hidden=qkv_units * 3, no_bias=False) + att_score = mx.sym.contrib.interleaved_matmul_selfatt_qk( + qkv_proj, heads=num_heads) + att_score = att_score + sonde + weighted_value = mx.sym.contrib.interleaved_matmul_selfatt_valatt( + qkv_proj, att_score, heads=num_heads) + output = mx.sym.FullyConnected(weighted_value, weight=out_weight, bias=out_bias, flatten=False, + num_hidden=out_dim, no_bias=False) + output = mx.sym.transpose(output, axes=(1, 0, 2)) + output = mx.sym.Group([output, att_score]) + executor = output.simple_bind(ctx=mx.gpu(0), + qkv=(batch_size, qkv_length, qkv_dim), + q_weight=(qkv_units, qkv_dim), + q_bias=(qkv_units,), + k_weight=(qkv_units, qkv_dim), + k_bias=(qkv_units,), + v_weight=(qkv_units, qkv_dim), + v_bias=(qkv_units,), + type_dict={'qkv': dtype, + 'q_weight': dtype, + 'k_weight': dtype, + 'v_weight': dtype, + 'q_bias': dtype, + 'k_bias': dtype, + 'v_bias': dtype, + 'sonde': dtype}, + grad_req='write', force_rebind=True) + output_shape = executor.outputs[0].shape + output_grads = np.random.rand(*output_shape).astype(dtype) * 0.1 + executor.copy_params_from(arg_params, {}) + executor.arg_dict['sonde'][:] = 0. 
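+    # 'sonde' is an additive term on att_score; it is zeroed here so that it
+    # does not change the numerical result of the fused path, while still
+    # giving the attention-score branch a bindable argument for simple_bind.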
+ executor.arg_dict['sonde'].wait_to_read() + executor.forward(is_train=True) + output_opti = executor.outputs[0].asnumpy() + att_score_opti = executor.outputs[1].asnumpy() + executor.backward([mx.nd.array(output_grads, dtype=dtype), + mx.nd.zeros(att_score_opti.shape, dtype=dtype)]) + grads_opti = {k: v.asnumpy() for k, v in executor.grad_dict.items()} + qkv = mx.sym.Variable('qkv') + sonde = mx.sym.Variable('sonde') + q_weight = mx.sym.Variable('q_weight') + k_weight = mx.sym.Variable('k_weight') + v_weight = mx.sym.Variable('v_weight') + q_bias = mx.sym.Variable('q_bias') + k_bias = mx.sym.Variable('k_bias') + v_bias = mx.sym.Variable('v_bias') + out_weight = mx.sym.Variable('out_weight') + out_bias = mx.sym.Variable('out_bias') + + q = mx.sym.FullyConnected(qkv, weight=q_weight, bias=q_bias, flatten=False, + num_hidden=qkv_units, no_bias=False) + k = mx.sym.FullyConnected(qkv, weight=k_weight, bias=k_bias, flatten=False, + num_hidden=qkv_units, no_bias=False) + v = mx.sym.FullyConnected(qkv, weight=v_weight, bias=v_bias, flatten=False, + num_hidden=qkv_units, no_bias=False) + q = mx.sym.reshape(q, shape=(0, 0, num_heads, -1)) + q = mx.sym.transpose(q, axes=(0, 2, 1, 3)) + q = mx.sym.reshape(q, shape=(-1, 0, 0), reverse=True) + k = mx.sym.reshape(k, shape=(0, 0, num_heads, -1)) + k = mx.sym.transpose(k, axes=(0, 2, 1, 3)) + k = mx.sym.reshape(k, shape=(-1, 0, 0), reverse=True) + q = mx.sym.contrib.div_sqrt_dim(q) + att_score = mx.sym.batch_dot(q, k, transpose_b=True) + att_score = att_score + sonde + v = mx.sym.reshape(v, shape=(0, 0, num_heads, -1)) + v = mx.sym.transpose(v, axes=(0, 2, 1, 3)) + v = mx.sym.reshape(v, shape=(-1, 0, 0), reverse=True) + weighted_value = mx.sym.batch_dot(att_score, v) + weighted_value = mx.sym.reshape(weighted_value, shape=(-1, num_heads, 0, 0), + reverse=True) + weighted_value = mx.sym.transpose(weighted_value, axes=(0, 2, 1, 3)) + weighted_value = mx.sym.reshape(weighted_value, shape=(0, 0, -1)) + output = mx.sym.FullyConnected(weighted_value, weight=out_weight, bias=out_bias, flatten=False, + num_hidden=out_dim, no_bias=False) + output = mx.sym.Group([output, att_score]) + executor = output.simple_bind(ctx=mx.gpu(0), + qkv=(batch_size, qkv_length, qkv_dim), + type_dict={'qkv': dtype}, + grad_req='write', force_rebind=True) + executor.copy_params_from(arg_params, {}) + executor.arg_dict['sonde'][:] = 0. 
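+    # The unfused reference graph (separate FullyConnected projections plus
+    # batch_dot) built above is now run with the same parameters and output
+    # gradients as the fused interleaved_matmul path, and its outputs and
+    # gradients are compared with assert_allclose at rtol=1e-2 / atol=1e-3,
+    # tolerances loose enough to also cover the float16 runs.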
+ executor.arg_dict['sonde'].wait_to_read() + executor.forward(is_train=True) + output_orig = executor.outputs[0].asnumpy() + att_score_orig = executor.outputs[1].asnumpy() + executor.backward([mx.nd.array(output_grads, dtype=dtype), + mx.nd.zeros(att_score_orig.shape, dtype=dtype)]) + grads_orig = {k : v.asnumpy() for k, v in executor.grad_dict.items()} + assert_allclose(att_score_orig, att_score_opti, rtol=1e-2, atol=1e-3) + assert_allclose(output_orig, output_opti, rtol=1e-2, atol=1e-3) + + for k in grads_opti.keys(): + assert(grads_orig[k].dtype == grads_opti[k].dtype) + assert(grads_orig[k].shape == grads_opti[k].shape) + assert_allclose(grads_orig[k], grads_opti[k], rtol=1e-2, atol=1e-3) + +def test_multihead_attention_selfatt(): + for dtype in ['float16', 'float32']: + check_multihead_attention_selfatt(dtype=dtype) + +def check_multihead_attention_encdec(dtype): + def convert_weight(F, k_weight, v_weight, num_heads): + k_weight = F.reshape(k_weight, shape=(num_heads, -1, 0), reverse=True) + v_weight = F.reshape(v_weight, shape=(num_heads, -1, 0), reverse=True) + all_weights = F.concat(k_weight, v_weight, dim=-2) + all_weights = F.reshape(all_weights, shape=(-1, 0), reverse=True) + return all_weights + + def convert_bias(F, k_bias, v_bias, num_heads): + k_bias = F.reshape(k_bias, shape=(num_heads, -1)) + v_bias = F.reshape(v_bias, shape=(num_heads, -1)) + all_bias = F.stack(k_bias, v_bias, axis=1) + all_bias = F.reshape(all_bias, shape=(-1,)) + return all_bias + + batch_size = 2 + qkv_length = 7 # length of a sequence + qkv_dim = 9 # dimension of encoding + num_heads = 3 # number of attention head + head_dim = 5 # head size + out_dim = 13 * num_heads + qkv_units = num_heads * head_dim + + arg_params = { + 'q': mx.nd.array(np.random.rand(*(batch_size, qkv_length, qkv_dim)).astype(dtype) * 0.1, dtype=dtype), + 'kv': mx.nd.array(np.random.rand(*(batch_size, qkv_length, qkv_dim)).astype(dtype) * 0.1, dtype=dtype), + 'q_weight': mx.nd.array(np.random.rand(*(qkv_units, qkv_dim)).astype(dtype) * 0.1, dtype=dtype), + 'k_weight': mx.nd.array(np.random.rand(*(qkv_units, qkv_dim)).astype(dtype) * 0.1, dtype=dtype), + 'v_weight': mx.nd.array(np.random.rand(*(qkv_units, qkv_dim)).astype(dtype) * 0.1, dtype=dtype), + 'q_bias': mx.nd.array(np.random.rand(*(qkv_units,)).astype(dtype) * 0.1, dtype=dtype), + 'k_bias': mx.nd.array(np.random.rand(*(qkv_units,)).astype(dtype) * 0.1, dtype=dtype), + 'v_bias': mx.nd.array(np.random.rand(*(qkv_units,)).astype(dtype) * 0.1, dtype=dtype), + 'out_weight': mx.nd.array(np.random.rand(*(out_dim, qkv_units)).astype(dtype) * 0.1, dtype=dtype), + 'out_bias': mx.nd.array(np.random.rand(*(out_dim,)).astype(dtype) * 0.1, dtype=dtype), + } + + q = mx.sym.Variable('q') + kv = mx.sym.Variable('kv') + sonde = mx.sym.Variable('sonde') + q_weight = mx.sym.Variable('q_weight') + k_weight = mx.sym.Variable('k_weight') + v_weight = mx.sym.Variable('v_weight') + q_bias = mx.sym.Variable('q_bias') + k_bias = mx.sym.Variable('k_bias') + v_bias = mx.sym.Variable('v_bias') + out_weight = mx.sym.Variable('out_weight') + out_bias = mx.sym.Variable('out_bias') + kv_weight = convert_weight(mx.sym, k_weight, v_weight, num_heads) + kv_bias = convert_bias(mx.sym, k_bias, v_bias, num_heads) + kv = mx.sym.transpose(kv, axes=(1, 0, 2)) + kv_proj = mx.sym.FullyConnected(kv, weight=kv_weight, bias=kv_bias, flatten=False, + num_hidden=qkv_units * 2, no_bias=False) + q = mx.sym.transpose(q, axes=(1, 0, 2)) + q_proj = mx.sym.FullyConnected(q, weight=q_weight, bias=q_bias, flatten=False, + 
num_hidden=qkv_units, no_bias=False) + att_score = mx.sym.contrib.interleaved_matmul_encdec_qk( + q_proj, kv_proj, heads=num_heads) + att_score = att_score + sonde + weighted_value = mx.sym.contrib.interleaved_matmul_encdec_valatt( + kv_proj, att_score, heads=num_heads) + output = mx.sym.FullyConnected(weighted_value, weight=out_weight, bias=out_bias, flatten=False, + num_hidden=out_dim, no_bias=False) + output = mx.sym.transpose(output, axes=(1, 0, 2)) + output = mx.sym.Group([output, att_score]) + executor = output.simple_bind(ctx=mx.gpu(0), + q=(batch_size, qkv_length, qkv_dim), + kv=(batch_size, qkv_length, qkv_dim), + q_weight=(qkv_units, qkv_dim), + q_bias=(qkv_units,), + k_weight=(qkv_units, qkv_dim), + k_bias=(qkv_units,), + v_weight=(qkv_units, qkv_dim), + v_bias=(qkv_units,), + out_weight=(out_dim, qkv_units), + out_bias=(out_dim,), + type_dict={'q': dtype, + 'kv': dtype, + 'q_weight': dtype, + 'q_bias': dtype, + 'k_weight': dtype, + 'k_bias': dtype, + 'v_weight': dtype, + 'v_bias': dtype, + 'out_weight': dtype, + 'out_bias': dtype, + }, + grad_req='write', force_rebind=True) + output_shape = executor.outputs[0].shape + output_grads = np.random.rand(*output_shape).astype(dtype) * 0.1 + executor.copy_params_from(arg_params, {}) + executor.arg_dict['sonde'][:] = 0. + executor.arg_dict['sonde'].wait_to_read() + executor.forward(is_train=True) + output_opti = executor.outputs[0].asnumpy() + att_score_opti = executor.outputs[1].asnumpy() + executor.backward([mx.nd.array(output_grads, dtype=dtype), mx.nd.zeros(att_score_opti.shape, dtype=dtype)]) + + grads_opti = {k: v.asnumpy() for k, v in executor.grad_dict.items()} + + q = mx.sym.Variable('q') + kv = mx.sym.Variable('kv') + sonde = mx.sym.Variable('sonde') + q_weight = mx.sym.Variable('q_weight') + k_weight = mx.sym.Variable('k_weight') + v_weight = mx.sym.Variable('v_weight') + q_bias = mx.sym.Variable('q_bias') + k_bias = mx.sym.Variable('k_bias') + v_bias = mx.sym.Variable('v_bias') + out_weight = mx.sym.Variable('out_weight') + out_bias = mx.sym.Variable('out_bias') + + q = mx.sym.FullyConnected(q, weight=q_weight, bias=q_bias, flatten=False, + num_hidden=qkv_units, no_bias=False) + k = mx.sym.FullyConnected(kv, weight=k_weight, bias=k_bias, flatten=False, + num_hidden=qkv_units, no_bias=False) + v = mx.sym.FullyConnected(kv, weight=v_weight, bias=v_bias, flatten=False, + num_hidden=qkv_units, no_bias=False) + q = mx.sym.reshape(q, shape=(0, 0, num_heads, -1)) + q = mx.sym.transpose(q, axes=(0, 2, 1, 3)) + q = mx.sym.reshape(q, shape=(-1, 0, 0), reverse=True) + k = mx.sym.reshape(k, shape=(0, 0, num_heads, -1)) + k = mx.sym.transpose(k, axes=(0, 2, 1, 3)) + k = mx.sym.reshape(k, shape=(-1, 0, 0), reverse=True) + q = mx.sym.contrib.div_sqrt_dim(q) + att_score = mx.sym.batch_dot(q, k, transpose_b=True) + att_score = att_score + sonde + v = mx.sym.reshape(v, shape=(0, 0, num_heads, -1)) + v = mx.sym.transpose(v, axes=(0, 2, 1, 3)) + v = mx.sym.reshape(v, shape=(-1, 0, 0), reverse=True) + weighted_value = mx.sym.batch_dot(att_score, v) + weighted_value = mx.sym.reshape(weighted_value, shape=(-1, num_heads, 0, 0), + reverse=True) + weighted_value = mx.sym.transpose(weighted_value, axes=(0, 2, 1, 3)) + weighted_value = mx.sym.reshape(weighted_value, shape=(0, 0, -1)) + output = mx.sym.FullyConnected(weighted_value, weight=out_weight, bias=out_bias, flatten=False, + num_hidden=out_dim, no_bias=False) + output = mx.sym.Group([output, att_score]) + executor = output.simple_bind(ctx=mx.gpu(0), + q=(batch_size, qkv_length, qkv_dim), + 
kv=(batch_size, qkv_length, qkv_dim), + type_dict={'q': dtype, + 'kv': dtype}, + grad_req='write', force_rebind=True) + executor.copy_params_from(arg_params, {}) + executor.arg_dict['sonde'][:] = 0. + executor.arg_dict['sonde'].wait_to_read() + executor.forward(is_train=True) + output_orig = executor.outputs[0].asnumpy() + att_score_orig = executor.outputs[1].asnumpy() + executor.backward([mx.nd.array(output_grads, dtype=dtype), mx.nd.zeros(att_score_orig.shape, dtype=dtype)]) + grads_orig = {k : v.asnumpy() for k, v in executor.grad_dict.items()} + assert_allclose(att_score_orig, att_score_opti, rtol=1e-2, atol=1e-3) + assert_allclose(output_orig, output_opti, rtol=1e-2, atol=1e-3) + + for k in grads_opti.keys(): + assert(grads_orig[k].dtype == grads_opti[k].dtype) + assert(grads_orig[k].shape == grads_opti[k].shape) + assert_allclose(grads_orig[k], grads_opti[k], rtol=1e-2, atol=1e-3) + +def test_multihead_attention_encdec(): + for dtype in ['float16', 'float32']: + check_multihead_attention_encdec(dtype=dtype) if __name__ == '__main__': import nose From 2df187c6cc2a4953c494a307dd0575784123a24c Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Fri, 8 Nov 2019 11:12:52 -0800 Subject: [PATCH 4/4] Removed unrelated test --- tests/python/unittest/test_numpy_op.py | 112 ------------------------- 1 file changed, 112 deletions(-) diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py index d04b97b64342..c1a6ed567b94 100644 --- a/tests/python/unittest/test_numpy_op.py +++ b/tests/python/unittest/test_numpy_op.py @@ -2871,118 +2871,6 @@ def check_svd(UT, L, V, data_np): check_svd(UT, L, V, data_np) -@with_seed() -@use_np -def test_np_linalg_cholesky(): - class TestCholesky(HybridBlock): - def __init__(self): - super(TestCholesky, self).__init__() - - def hybrid_forward(self, F, data): - return F.np.linalg.cholesky(data) - - def get_grad(L): - # shape of m is [batch, n, n] - if 0 in L.shape: - return L - - def copyltu(m): - eye = _np.array([_np.eye(m.shape[-1]) for i in range(m.shape[0])]) - lower = _np.tril(m) - eye * m - lower_mask = _np.tril(_np.ones_like(m)) - ret = lower_mask * m + lower.swapaxes(-1, -2) - return ret - - shape = L.shape - L = L.reshape(-1, shape[-2], shape[-1]) - dL = _np.ones_like(L) - L_inv = _np.linalg.inv(L) - L_inv_T = L_inv.swapaxes(-1, -2) - L_T = L.swapaxes(-1, -2) - sym_L_inv = 0.5 * (L_inv + L_inv_T) - dA = 0.5 * _np.matmul(_np.matmul(L_inv_T, copyltu(_np.matmul(L_T, dL))), L_inv) - return dA.reshape(shape) - - def check_cholesky(L, data_np): - assert L.shape == data_np.shape - # catch error if numpy throws rank < 2 - try: - L_expected = _np.linalg.cholesky(data_np) - except Exception as e: - print(data_np) - print(data_np.shape) - print(e) - else: - assert L.shape == L_expected.shape - assert_almost_equal(L.asnumpy(), L_expected, rtol=rtol, atol=atol) - - shapes = [ - (0, 0), - (1, 1), - (5, 5), - (6, 6), - (6, 6, 6), - (1, 0, 0), - (0, 1, 1), - (2, 3, 4, 4), - ] - dtypes = ['float32', 'float64'] - for hybridize, dtype, shape in itertools.product([True, False], dtypes, shapes): - atol = rtol = 1e-2 - - test_cholesky = TestCholesky() - if hybridize: - test_cholesky.hybridize() - - # Numerical issue: - # When backpropagating through Cholesky decomposition, we need to compute the inverse - # of L according to dA = 0.5 * L**(-T) * copyLTU(L**T * dL) * L**(-1) where A = LL^T. - # The inverse is calculated by "trsm" method in CBLAS. When the data type is float32, - # this causes numerical instability. 
It happens when the matrix is ill-conditioned. - # In this example, the issue occurs frequently if the symmetric positive definite input - # matrix A is constructed by A = LL^T + \epsilon * I. A proper way of testing such - # operators involving numerically unstable operations is to use well-conditioned random - # matrices as input. Here we test Cholesky decomposition for FP32 and FP64 separately. - # See rocBLAS: - # https://github.com/ROCmSoftwarePlatform/rocBLAS/wiki/9.Numerical-Stability-in-TRSM - - # generate symmetric PD matrices - if 0 in shape: - data_np = np.ones(shape) - else: - data_np_l = _np.random.uniform(-10., 10., shape) - if dtype == 'float32': - data_np_l_flat = data_np_l.reshape((-1, shape[-2], shape[-1])) - else: - data_np_l_flat = _np.tril(data_np_l.reshape((-1, shape[-2], shape[-1]))) - for i in range(data_np_l_flat.shape[0]): - for j in range(data_np_l_flat.shape[-1]): - if data_np_l_flat[i, j, j] < 0: - data_np_l_flat[i, j, j] = -data_np_l_flat[i, j, j] - elif data_np_l_flat[i, j, j] == 0: - data_np_l_flat[i, j, j] = 2 - data_np = _np.matmul(data_np_l_flat, data_np_l_flat.swapaxes(-1, -2)) - data_np = data_np.reshape(shape) - - # When dtype is np.FP32, truncation from FP64 to FP32 could also be a source of - # instability since the ground-truth gradient is computed using FP64 data. - data = np.array(data_np, dtype=dtype) - data.attach_grad() - with mx.autograd.record(): - L = test_cholesky(data) - - # check cholesky validity - check_cholesky(L, data_np) - # check backward. backward does not support empty input - if 0 not in L.shape: - mx.autograd.backward(L) - backward_expected = get_grad(L.asnumpy()) - assert_almost_equal(data.grad.asnumpy(), backward_expected, rtol=rtol, atol=atol) - # check imperative once again - L = np.linalg.cholesky(data) - check_cholesky(L, data_np) - - @with_seed() @use_np def test_np_vstack():