From e139442461b4f82dc149fdc7dc6e009a348f4d9c Mon Sep 17 00:00:00 2001 From: Hao Jin Date: Sat, 2 Nov 2019 22:15:55 -0700 Subject: [PATCH] support mixed-precision true_divide (#16711) --- src/common/utils.h | 36 +++ src/operator/leaky_relu-inl.h | 3 +- src/operator/mshadow_op.h | 40 +++ src/operator/mxnet_op.h | 113 +++++++ src/operator/nn/dropout-inl.h | 6 +- src/operator/nn/softmax-inl.h | 6 +- src/operator/numpy/np_true_divide-inl.h | 284 +++++++++++++++--- src/operator/numpy/np_true_divide.cc | 51 +++- src/operator/numpy/np_true_divide.cu | 2 +- .../tensor/elemwise_binary_broadcast_op.h | 57 +++- .../tensor/elemwise_binary_scalar_op.h | 2 +- tests/python/unittest/test_numpy_op.py | 24 +- 12 files changed, 549 insertions(+), 75 deletions(-) diff --git a/src/common/utils.h b/src/common/utils.h index 2b4b821a1835..b919cb301dff 100644 --- a/src/common/utils.h +++ b/src/common/utils.h @@ -842,6 +842,42 @@ inline bool is_float(const int dtype) { return dtype == mshadow::kFloat32 || dtype == mshadow::kFloat64 || dtype == mshadow::kFloat16; } +inline int more_precise_type(const int type1, const int type2) { + if (type1 == type2) return type1; + if (is_float(type1) && is_float(type2)) { + if (type1 == mshadow::kFloat64 || type2 == mshadow::kFloat64) { + return mshadow::kFloat64; + } + if (type1 == mshadow::kFloat32 || type2 == mshadow::kFloat32) { + return mshadow::kFloat32; + } + return mshadow::kFloat16; + } else if (is_float(type1) || is_float(type2)) { + return is_float(type1) ? type1 : type2; + } + if (type1 == mshadow::kInt64 || type2 == mshadow::kInt64) { + return mshadow::kInt64; + } + if (type1 == mshadow::kInt32 || type2 == mshadow::kInt32) { + return mshadow::kInt32; + } + CHECK(!((type1 == mshadow::kUint8 && type2 == mshadow::kInt8) || + (type1 == mshadow::kInt8 && type2 == mshadow::kUint8))) + << "1 is UInt8 and 1 is Int8 should not get here"; + if (type1 == mshadow::kUint8 || type2 == mshadow::kUint8) { + return mshadow::kUint8; + } + return mshadow::kInt8; +} + +inline int np_binary_out_type(const int type1, const int type2) { + if ((type1 == mshadow::kUint8 && type2 == mshadow::kInt8) || + (type1 == mshadow::kInt8 && type2 == mshadow::kUint8)) { + return mshadow::kInt32; + } + return more_precise_type(type1, type2); +} + } // namespace common } // namespace mxnet #endif // MXNET_COMMON_UTILS_H_ diff --git a/src/operator/leaky_relu-inl.h b/src/operator/leaky_relu-inl.h index d73fa1be54a4..3d81cfc0d967 100644 --- a/src/operator/leaky_relu-inl.h +++ b/src/operator/leaky_relu-inl.h @@ -134,8 +134,7 @@ class LeakyReLUOp : public Operator { mshadow::Shape oshape = new_oshape.get(); mshadow::Shape lstride = mxnet_op::calc_stride(new_lshape.get()); mshadow::Shape rstride = mxnet_op::calc_stride(new_rshape.get()); - mxnet_op::Kernel, xpu>:: + mxnet_op::Kernel, xpu>:: template LaunchEx(s, new_oshape.Size(), req[leakyrelu::kOut], lstride, rstride, oshape, in_data[leakyrelu::kData].dptr(), in_data[leakyrelu::kGamma].dptr(), out_data[leakyrelu::kOut].dptr()); diff --git a/src/operator/mshadow_op.h b/src/operator/mshadow_op.h index c5a2b1308c73..1ece97b0efd8 100644 --- a/src/operator/mshadow_op.h +++ b/src/operator/mshadow_op.h @@ -132,6 +132,26 @@ struct true_divide : public mxnet_op::tunable { MSHADOW_XINLINE static float Map(DType a, DType b) { return static_cast(a) / static_cast(b); } + +#ifndef _WIN32 + template::value, int>::type = 0> + MSHADOW_XINLINE static mshadow::half::half_t Map(DType a, mshadow::half::half_t b) { + return static_cast(a) / b; + } + + template::value, int>::type = 
0> + MSHADOW_XINLINE static float Map(DType a, float b) { + return static_cast(a) / b; + } + + template::value, int>::type = 0> + MSHADOW_XINLINE static double Map(DType a, double b) { + return static_cast(a) / b; + } +#endif }; struct rtrue_divide : public mxnet_op::tunable { @@ -146,6 +166,26 @@ struct rtrue_divide : public mxnet_op::tunable { MSHADOW_XINLINE static float Map(DType a, DType b) { return static_cast(b) / static_cast(a); } + +#ifndef _WIN32 + template::value, int>::type = 0> + MSHADOW_XINLINE static mshadow::half::half_t Map(DType a, mshadow::half::half_t b) { + return b / static_cast(a); + } + + template::value, int>::type = 0> + MSHADOW_XINLINE static float Map(DType a, float b) { + return b / static_cast(a); + } + + template::value, int>::type = 0> + MSHADOW_XINLINE static double Map(DType a, double b) { + return b / static_cast(a); + } +#endif }; MXNET_BINARY_MATH_OP_NC(left, a); diff --git a/src/operator/mxnet_op.h b/src/operator/mxnet_op.h index 91478660a123..5d297a547c8f 100644 --- a/src/operator/mxnet_op.h +++ b/src/operator/mxnet_op.h @@ -471,6 +471,69 @@ struct AccType { {__VA_ARGS__} \ } \ break; \ + case mshadow::kBool: \ + { \ + typedef bool DType; \ + {__VA_ARGS__} \ + } \ + break; \ + default: \ + LOG(FATAL) << "Unknown type enum " << type; \ + } + +#define MXNET_INT32_INT64_TYPE_SWITCH(type, DType, ...)\ + switch (type) { \ + case mshadow::kFloat32: \ + { \ + typedef float DType; \ + LOG(FATAL) << "This operation only support " \ + "integer types, not float32"; \ + } \ + break; \ + case mshadow::kFloat64: \ + { \ + typedef double DType; \ + LOG(FATAL) << "This operation only support " \ + "integer types, not float64"; \ + } \ + break; \ + case mshadow::kFloat16: \ + { \ + typedef mshadow::half::half_t DType; \ + LOG(FATAL) << "This operation only support " \ + "integer types, not float16"; \ + } \ + break; \ + case mshadow::kUint8: \ + { \ + LOG(FATAL) << "This operation only support " \ + "integer types, not uint8"; \ + } \ + break; \ + case mshadow::kInt8: \ + { \ + LOG(FATAL) << "This operation only support " \ + "integer types, not int8"; \ + } \ + break; \ + case mshadow::kInt32: \ + { \ + typedef int32_t DType; \ + {__VA_ARGS__} \ + } \ + break; \ + case mshadow::kInt64: \ + { \ + typedef int64_t DType; \ + {__VA_ARGS__} \ + } \ + break; \ + case mshadow::kBool: \ + { \ + LOG(FATAL) << "This operation only support " \ + "integer types, not bool"; \ + } \ + break; \ default: \ LOG(FATAL) << "Unknown type enum " << type; \ } @@ -783,6 +846,56 @@ struct op_with_req { KERNEL_ASSIGN(out[i], req, OP::Map(in[i], value)); } +#ifndef _WIN32 + /*! \brief inputs are two tensors with a half_t output tensor */ + template::value, int>::type = 0> + MSHADOW_XINLINE static void Map(index_t i, + mshadow::half::half_t *out, + const DType *lhs, + const mshadow::half::half_t *rhs) { + KERNEL_ASSIGN(out[i], req, OP::Map(lhs[i], rhs[i])); + } + + /*! \brief inputs are two tensors with a float output tensor */ + template::value, int>::type = 0> + MSHADOW_XINLINE static void Map(index_t i, float *out, const DType *lhs, const float *rhs) { + KERNEL_ASSIGN(out[i], req, OP::Map(lhs[i], rhs[i])); + } + + /*! \brief inputs are two tensors with a double output tensor */ + template::value, int>::type = 0> + MSHADOW_XINLINE static void Map(index_t i, double *out, const DType *lhs, const double *rhs) { + KERNEL_ASSIGN(out[i], req, OP::Map(lhs[i], rhs[i])); + } + + /*! 
\brief inputs are two tensors with a half_t output tensor */ + template::value, int>::type = 0> + MSHADOW_XINLINE static void Map(index_t i, + mshadow::half::half_t *out, + const DType *lhs, + const mshadow::half::half_t value) { + KERNEL_ASSIGN(out[i], req, OP::Map(lhs[i], value)); + } + + /*! \brief inputs are two tensors with a float output tensor */ + template::value, int>::type = 0> + MSHADOW_XINLINE static void Map(index_t i, float *out, const DType *lhs, const float value) { + KERNEL_ASSIGN(out[i], req, OP::Map(lhs[i], value)); + } + + /*! \brief inputs are two tensors with a double output tensor */ + template::value, int>::type = 0> + MSHADOW_XINLINE static void Map(index_t i, double *out, const DType *lhs, const double value) { + KERNEL_ASSIGN(out[i], req, OP::Map(lhs[i], value)); + } +#endif + /*! \brief inputs are two tensors with a float output tensor */ template::value, int>::type = 0> diff --git a/src/operator/nn/dropout-inl.h b/src/operator/nn/dropout-inl.h index 61239d33800c..1eff5cd8591d 100644 --- a/src/operator/nn/dropout-inl.h +++ b/src/operator/nn/dropout-inl.h @@ -394,8 +394,7 @@ class DropoutOp { mshadow::Shape oshape = new_oshape.get(); mshadow::Shape lstride = mxnet_op::calc_stride(new_lshape.get()); mshadow::Shape rstride = mxnet_op::calc_stride(new_rshape.get()); - mxnet_op::Kernel, xpu>:: + mxnet_op::Kernel, xpu>:: template LaunchEx(s, new_oshape.Size(), req[dropout::kOut], lstride, rstride, oshape, in.dptr(), @@ -463,8 +462,7 @@ class DropoutOp { mshadow::Shape oshape = new_oshape.get(); mshadow::Shape lstride = mxnet_op::calc_stride(new_lshape.get()); mshadow::Shape rstride = mxnet_op::calc_stride(new_rshape.get()); - mxnet_op::Kernel, xpu>:: + mxnet_op::Kernel, xpu>:: template LaunchEx(s, new_oshape.Size(), req[0], lstride, rstride, oshape, grad.dptr(), mask.dptr(), gdata.dptr()); }); diff --git a/src/operator/nn/softmax-inl.h b/src/operator/nn/softmax-inl.h index 601a0526650c..89da570c133b 100644 --- a/src/operator/nn/softmax-inl.h +++ b/src/operator/nn/softmax-inl.h @@ -790,7 +790,7 @@ void SoftmaxCompute(const nnvm::NodeAttrs& attrs, << "Mask needs to be provided when using softmax with use_length=True."; type = inputs[1].type_flag_; } - MXNET_INT_TYPE_SWITCH(type, IType, { + MXNET_INT32_INT64_TYPE_SWITCH(type, IType, { IType* mask_ptr = nullptr; if (param.use_length.value()) { mask_ptr = inputs[1].dptr(); @@ -834,7 +834,7 @@ void SoftmaxGradCompute(const nnvm::NodeAttrs& attrs, const std::vector& outputs) { using namespace mxnet_op; if (softmax_use_length(attrs)) { - MXNET_INT_TYPE_SWITCH(inputs[2].type_flag_, IType, { + MXNET_INT32_INT64_TYPE_SWITCH(inputs[2].type_flag_, IType, { if (req[1] != kNullOp) { mxnet_op::Kernel::Launch( ctx.get_stream(), outputs[1].Size(), outputs[1].dptr()); @@ -856,7 +856,7 @@ void SoftmaxGradCompute(const nnvm::NodeAttrs& attrs, MXNET_REAL_ACC_TYPE_SWITCH(inputs[0].type_flag_, OType, AType, { MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, { MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { - MXNET_INT_TYPE_SWITCH(itype, IType, { + MXNET_INT32_INT64_TYPE_SWITCH(itype, IType, { IType * length_ptr = nullptr; if (softmax_use_length(attrs)) { length_ptr = inputs[2].dptr(); diff --git a/src/operator/numpy/np_true_divide-inl.h b/src/operator/numpy/np_true_divide-inl.h index cc74e19aef8f..0bc60a08803e 100644 --- a/src/operator/numpy/np_true_divide-inl.h +++ b/src/operator/numpy/np_true_divide-inl.h @@ -43,30 +43,42 @@ void TrueDivideScalarCompute(const nnvm::NodeAttrs &attrs, CHECK_EQ(outputs.size(), 1U); if (req[0] == kNullOp || 
outputs[0].Size() == 0U) return; using namespace mshadow; + using namespace mxnet_op; using namespace mshadow::expr; Stream *s = ctx.get_stream(); const double alpha = nnvm::get(attrs.parsed); - if (common::is_float(inputs[0].type_flag_)) { + const TBlob& data = inputs[0]; + const TBlob& out = outputs[0]; + if (out.type_flag_ == data.type_flag_) { MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, { MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { - mxnet_op::Kernel, xpu>::Launch( - s, inputs[0].Size(), outputs[0].dptr(), inputs[0].dptr(), DType(alpha)); + Kernel, xpu>::Launch( + s, data.Size(), out.dptr(), data.dptr(), DType(alpha)); }); }); } else { +#ifndef _WIN32 CHECK_EQ(outputs[0].type_flag_, kFloat32) << "true_divide only supports float32 output " "when input's dtype is " << type_string(inputs[0].type_flag_); MXNET_INT_TYPE_SWITCH(inputs[0].type_flag_, DType, { MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { - mxnet_op::Kernel, xpu>::Launch( - s, inputs[0].Size(), outputs[0].dptr(), inputs[0].dptr(), DType(alpha)); + Kernel, xpu>::Launch( + s, data.Size(), out.dptr(), data.dptr(), + static_cast(alpha)); }); }); +#else + Tensor temp_tensor = + ctx.requested[0].get_space_typed(mshadow::Shape1(data.Size()), s); + TBlob temp_tblob(temp_tensor); + CastCompute(attrs, ctx, {data}, {kWriteTo}, {temp_tblob}); + TrueDivideScalarCompute(attrs, ctx, {temp_tblob}, req, outputs); +#endif } } -template +template void TrueDivideElemwiseCompute(const nnvm::NodeAttrs &attrs, const OpContext &ctx, const std::vector &inputs, @@ -77,66 +89,254 @@ void TrueDivideElemwiseCompute(const nnvm::NodeAttrs &attrs, Stream *s = ctx.get_stream(); CHECK_EQ(inputs.size(), 2U); CHECK_EQ(outputs.size(), 1U); - MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { - if (common::is_float(inputs[0].type_flag_)) { - MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { - Kernel, xpu>::Launch(s, outputs[0].Size(), - outputs[0].dptr(), - inputs[0].dptr(), - inputs[1].dptr()); + + const TBlob& lhs = inputs[0]; + const TBlob& rhs = inputs[1]; + const TBlob& out = outputs[0]; + if (lhs.type_flag_ == rhs.type_flag_) { + // Case when types of the 2 input tensors are the same + if (common::is_float(lhs.type_flag_)) { + // If both are the same floats, normal launch + MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { + MSHADOW_REAL_TYPE_SWITCH(lhs.type_flag_, DType, { + Kernel, xpu>::Launch( + s, out.Size(), out.dptr(), lhs.dptr(), rhs.dptr()); + }); }); } else { - CHECK_EQ(outputs[0].type_flag_, kFloat32) << "true_divide only supports float32 output " - "when input's dtype is " - << type_string(inputs[0].type_flag_); - MXNET_INT_TYPE_SWITCH(inputs[0].type_flag_, DType, { - Kernel, xpu>::Launch(s, outputs[0].Size(), - outputs[0].dptr(), - inputs[0].dptr(), - inputs[1].dptr()); + // If both are the same integers, output is float32 + CHECK_EQ(out.type_flag_, kFloat32) << "true_divide only supports float32 output " + "when input's dtype is " + << type_string(lhs.type_flag_); + MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { + MXNET_INT_TYPE_SWITCH(lhs.type_flag_, DType, { + Kernel, xpu>::Launch( + s, out.Size(), out.dptr(), lhs.dptr(), rhs.dptr()); + }); }); } - }); + } else { +#ifndef _WIN32 + // Non-windows case: no usage of temporary space + // Case when types of the 2 input tensors are different + if (common::is_float(lhs.type_flag_) && common::is_float(rhs.type_flag_)) { + // both lhs and rhs are float types, output type is the more precise one + LOG(ERROR) << "not implemented yet..."; + } else if (common::is_float(lhs.type_flag_) || common::is_float(rhs.type_flag_)) { + // 
one is float type, the other is integer type, the output type should be the same as float + CHECK_EQ(out.type_flag_, + common::is_float(lhs.type_flag_) ? lhs.type_flag_ : rhs.type_flag_) + << "This case out type should be same as the float type"; + if (common::is_float(lhs.type_flag_)) { + // lhs is the float one + MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { + MSHADOW_REAL_TYPE_SWITCH(lhs.type_flag_, LType, { + MXNET_INT_TYPE_SWITCH(rhs.type_flag_, RType, { + Kernel, xpu>::Launch( + s, out.Size(), out.dptr(), rhs.dptr(), lhs.dptr()); + }); + }); + }); + } else { + // rhs is the float one + MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { + MXNET_INT_TYPE_SWITCH(lhs.type_flag_, LType, { + MSHADOW_REAL_TYPE_SWITCH(rhs.type_flag_, RType, { + Kernel, xpu>::Launch( + s, out.Size(), out.dptr(), lhs.dptr(), rhs.dptr()); + }); + }); + }); + } + } else { + // lhs is integer type, rhs is integer type, output type should be float + LOG(ERROR) << "not implemented yet..."; + } +#else + // Windows case: using temp space for casting the type + // Case when types of the 2 input tensors are different + if (common::is_float(lhs.type_flag_) && common::is_float(rhs.type_flag_)) { + // both lhs and rhs are float types, output type is the more precise one + LOG(ERROR) << "not implemented yet..."; + } else if (common::is_float(lhs.type_flag_) || common::is_float(rhs.type_flag_)) { + // lhs is float type, rhs is integer type, the output type should be the same as lhs + CHECK_EQ(out.type_flag_, + common::is_float(lhs.type_flag_) ? lhs.type_flag_ : rhs.type_flag_) + << "This case out type should be same as the float type"; + TBlob temp_tblob; + if (common::is_float(lhs.type_flag_)) { + // lhs is the float one + MSHADOW_REAL_TYPE_SWITCH(lhs.type_flag_, LType, { + Tensor temp_tensor = + ctx.requested[0].get_space_typed(mshadow::Shape1(rhs.Size()), s); + temp_tblob = TBlob(temp_tensor); + }); + CastCompute(attrs, ctx, {rhs}, {kWriteTo}, {temp_tblob}); + TrueDivideElemwiseCompute( + attrs, ctx, {lhs, temp_tblob.reshape(rhs.shape_)}, req, outputs); + } else { + // rhs is the float one + MSHADOW_REAL_TYPE_SWITCH(rhs.type_flag_, RType, { + Tensor temp_tensor = + ctx.requested[0].get_space_typed(mshadow::Shape1(lhs.Size()), s); + temp_tblob = TBlob(temp_tensor); + }); + CastCompute(attrs, ctx, {lhs}, {kWriteTo}, {temp_tblob}); + TrueDivideElemwiseCompute( + attrs, ctx, {temp_tblob.reshape(lhs.shape_), rhs}, req, outputs); + } + } else { + // lhs is integer type, rhs is integer type, output type should be float + LOG(ERROR) << "not implemented yet..."; + } +#endif + } } -template +template void TrueDivideBroadcastCompute(const nnvm::NodeAttrs& attrs, const OpContext& ctx, const std::vector& inputs, const std::vector& req, const std::vector& outputs) { + using namespace mxnet_op; if (outputs[0].shape_.Size() == 0U) return; + CHECK_EQ(inputs.size(), 2U); mxnet::TShape new_lshape, new_rshape, new_oshape; int ndim = BinaryBroadcastShapeCompact(inputs[0].shape_, inputs[1].shape_, outputs[0].shape_, &new_lshape, &new_rshape, &new_oshape); if (!ndim) { - TrueDivideElemwiseCompute(attrs, ctx, inputs, req, outputs); + TrueDivideElemwiseCompute(attrs, ctx, inputs, req, outputs); } else { if (req[0] == kNullOp) return; mshadow::Stream *s = ctx.get_stream(); + const TBlob& lhs = inputs[0]; + const TBlob& rhs = inputs[1]; + const TBlob& out = outputs[0]; +#ifndef _WIN32 BROADCAST_NDIM_SWITCH(ndim, NDim, { mshadow::Shape oshape = new_oshape.get(); - mshadow::Shape lstride = mxnet_op::calc_stride(new_lshape.get()); - mshadow::Shape rstride = 
mxnet_op::calc_stride(new_rshape.get()); - if (common::is_float(inputs[0].type_flag_)) { - MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { - mxnet_op::Kernel, xpu>:: - template LaunchEx(s, new_oshape.Size(), req[0], lstride, rstride, oshape, - inputs[0].dptr(), inputs[1].dptr(), - outputs[0].dptr()); - }); - } else { - CHECK_EQ(outputs[0].type_flag_, mshadow::kFloat32) + mshadow::Shape lstride = calc_stride(new_lshape.get()); + mshadow::Shape rstride = calc_stride(new_rshape.get()); + if (lhs.type_flag_ == rhs.type_flag_) { + // When the both inputs have the same data types + if (common::is_float(lhs.type_flag_)) { + // If both inputs are the same float types, output is the same float type + MSHADOW_REAL_TYPE_SWITCH(lhs.type_flag_, DType, { + Kernel, xpu>:: + template LaunchEx(s, new_oshape.Size(), req[0], lstride, rstride, oshape, + lhs.dptr(), rhs.dptr(), out.dptr()); + }); + } else { + CHECK_EQ(out.type_flag_, mshadow::kFloat32) << "true_divide only supports float32 output when input's dtype is " - << type_string(inputs[0].type_flag_); - MXNET_INT_TYPE_SWITCH(inputs[0].type_flag_, DType, { - mxnet_op::Kernel, xpu>:: - template LaunchEx(s, new_oshape.Size(), req[0], lstride, rstride, oshape, - inputs[0].dptr(), inputs[1].dptr(), - outputs[0].dptr()); - }); + << type_string(lhs.type_flag_); + MXNET_INT_TYPE_SWITCH(lhs.type_flag_, DType, { + // If both inputs are the same integer types, output is float type + Kernel, xpu>:: + template LaunchEx(s, new_oshape.Size(), req[0], lstride, rstride, oshape, + lhs.dptr(), rhs.dptr(), out.dptr()); + }); + } + } else { + if (common::is_float(lhs.type_flag_) && common::is_float(rhs.type_flag_)) { + // lhs and rhs have different float types, the output is the more precise one + LOG(ERROR) << "not implemented yet..."; + } else if (common::is_float(lhs.type_flag_) || common::is_float(rhs.type_flag_)) { + // one of lhs and rhs is float, the output is the same type as the float one + if (common::is_float(lhs.type_flag_)) { + // lhs is float type, output will be the same float type + CHECK_EQ(lhs.type_flag_, out.type_flag_) + << "lhs should have the same type as out, infer type broken?"; + MSHADOW_REAL_TYPE_SWITCH(lhs.type_flag_, LType, { + MXNET_INT_TYPE_SWITCH(rhs.type_flag_, RType, { + Kernel, xpu>:: + template LaunchEx(s, new_oshape.Size(), req[0], rstride, lstride, oshape, + rhs.dptr(), lhs.dptr(), out.dptr()); + }); + }); + } else { + // rhs is float type, output will be the same float type + CHECK_EQ(rhs.type_flag_, out.type_flag_) + << "rhs should have the same type as out, infer type broken?"; + MXNET_INT_TYPE_SWITCH(lhs.type_flag_, LType, { + MSHADOW_REAL_TYPE_SWITCH(rhs.type_flag_, RType, { + Kernel, xpu>:: + template LaunchEx(s, new_oshape.Size(), req[0], lstride, rstride, oshape, + lhs.dptr(), rhs.dptr(), out.dptr()); + }); + }); + } + } else { + // lhs and rhs have different integer types, the output is float type + LOG(ERROR) << "not implemented yet..."; + } } }); +#else + if (lhs.type_flag_ == rhs.type_flag_) { + BROADCAST_NDIM_SWITCH(ndim, NDim, { + mshadow::Shape oshape = new_oshape.get(); + mshadow::Shape lstride = calc_stride(new_lshape.get()); + mshadow::Shape rstride = calc_stride(new_rshape.get()); + // When the both inputs have the same data types + if (common::is_float(lhs.type_flag_)) { + // If both inputs are the same float types, output is the same float type + MSHADOW_REAL_TYPE_SWITCH(lhs.type_flag_, DType, { + Kernel, xpu>:: + template LaunchEx(s, new_oshape.Size(), req[0], lstride, rstride, oshape, + lhs.dptr(), 
rhs.dptr(), out.dptr()); + }); + } else { + CHECK_EQ(out.type_flag_, mshadow::kFloat32) + << "true_divide only supports float32 output when input's dtype is " + << type_string(lhs.type_flag_); + MXNET_INT_TYPE_SWITCH(lhs.type_flag_, DType, { + // If both inputs are the same integer types, output is float type + Kernel, xpu>:: + template LaunchEx(s, new_oshape.Size(), req[0], lstride, rstride, oshape, + lhs.dptr(), rhs.dptr(), out.dptr()); + }); + } + }); + } else { + if (common::is_float(lhs.type_flag_) && common::is_float(rhs.type_flag_)) { + // lhs and rhs have different float types, the output is the more precise one + LOG(ERROR) << "not implemented yet..."; + } else if (common::is_float(lhs.type_flag_) || common::is_float(rhs.type_flag_)) { + // one of lhs and rhs is float, the output is the same type as the float one + TBlob temp_tblob; + if (common::is_float(lhs.type_flag_)) { + // lhs is float type, output will be the same float type + CHECK_EQ(lhs.type_flag_, out.type_flag_) + << "lhs should have the same type as out, infer type broken?"; + MSHADOW_REAL_TYPE_SWITCH(lhs.type_flag_, LType, { + Tensor temp_tensor = + ctx.requested[0].get_space_typed(mshadow::Shape1(rhs.Size()), s); + temp_tblob = TBlob(temp_tensor); + }); + CastCompute(attrs, ctx, {rhs}, {kWriteTo}, {temp_tblob}); + TrueDivideBroadcastCompute( + attrs, ctx, {lhs, temp_tblob.reshape(rhs.shape_)}, req, outputs); + } else { + // rhs is float type, output will be the same float type + CHECK_EQ(rhs.type_flag_, out.type_flag_) + << "rhs should have the same type as out, infer type broken?"; + MSHADOW_REAL_TYPE_SWITCH(rhs.type_flag_, RType, { + Tensor temp_tensor = + ctx.requested[0].get_space_typed(mshadow::Shape1(lhs.Size()), s); + temp_tblob = TBlob(temp_tensor); + }); + CastCompute(attrs, ctx, {lhs}, {kWriteTo}, {temp_tblob}); + TrueDivideBroadcastCompute( + attrs, ctx, {temp_tblob.reshape(lhs.shape_), rhs}, req, outputs); + } + } else { + // lhs and rhs have different integer types, the output is float type + LOG(ERROR) << "not implemented yet..."; + } + } +#endif } } diff --git a/src/operator/numpy/np_true_divide.cc b/src/operator/numpy/np_true_divide.cc index 5a4634c3ff8c..d2135befef42 100644 --- a/src/operator/numpy/np_true_divide.cc +++ b/src/operator/numpy/np_true_divide.cc @@ -28,26 +28,35 @@ namespace mxnet { namespace op { +int TrueDivideOutType(int ltype, int rtype) { + if (common::is_float(ltype) && common::is_float(rtype)) { + // If both inputs are float, return the one with the higher precision + return common::more_precise_type(ltype, rtype); + } else if (common::is_float(ltype) || common::is_float(rtype)) { + // If only one of the inputs is float, return that float type + return (common::is_float(ltype)) ? 
ltype : rtype; + } + // If neither of the inputs is float, return the default float32 type + return mshadow::kFloat32; +} + template bool TrueDivideType(const nnvm::NodeAttrs& attrs, std::vector* in_attrs, std::vector* out_attrs) { CHECK_EQ(in_attrs->size(), static_cast(num_inputs)); + CHECK_GT(in_attrs->size(), 0U); CHECK_EQ(out_attrs->size(), 1U); + for (const int dtype : *in_attrs) { if (dtype == -1) return false; } - if (num_inputs == 2) { - const int lhs_dtype = in_attrs->at(0); - const int rhs_dtype = in_attrs->at(1); - CHECK_EQ(lhs_dtype, rhs_dtype) - << "true_divide currently only supports same dtype for dividend and divisor"; - } - if (common::is_float(in_attrs->at(0))) { - TYPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(0)); - } else { - TYPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::kFloat32); - } + + const int lhs_dtype = in_attrs->at(0); + const int rhs_dtype = (num_inputs == 2) ? + in_attrs->at(1) : + (common::is_float(lhs_dtype) ? lhs_dtype : mshadow::kFloat32); + TYPE_ASSIGN_CHECK(*out_attrs, 0, TrueDivideOutType(lhs_dtype, rhs_dtype)); return true; } @@ -64,7 +73,13 @@ NNVM_REGISTER_OP(_npi_true_divide) [](const NodeAttrs& attrs){ return std::vector >{{0, 0}, {1, 0}}; }) -.set_attr("FCompute", TrueDivideBroadcastCompute) +#ifdef _WIN32 +.set_attr("FResourceRequest", + [](const NodeAttrs& attrs) { + return std::vector{ResourceRequest::kTempSpace}; + }) +#endif +.set_attr("FCompute", TrueDivideBroadcastCompute) .set_attr("FGradient", ElemwiseGradUseIn{"_backward_broadcast_div"}) .add_argument("lhs", "NDArray-or-Symbol", "Dividend array") .add_argument("rhs", "NDArray-or-Symbol", "Divisor array"); @@ -81,6 +96,12 @@ NNVM_REGISTER_OP(_npi_true_divide_scalar) [](const NodeAttrs& attrs) { return std::vector >{{0, 0}}; }) +#ifdef _WIN32 +.set_attr("FResourceRequest", + [](const NodeAttrs& attrs) { + return std::vector{ResourceRequest::kTempSpace}; + }) +#endif .set_attr("FCompute", TrueDivideScalarCompute) .set_attr("FGradient", ElemwiseGradUseNone{"_backward_div_scalar"}) .add_argument("data", "NDArray-or-Symbol", "source input") @@ -98,6 +119,12 @@ NNVM_REGISTER_OP(_npi_rtrue_divide_scalar) [](const NodeAttrs& attrs) { return std::vector >{{0, 0}}; }) +#ifdef _WIN32 +.set_attr("FResourceRequest", + [](const NodeAttrs& attrs) { + return std::vector{ResourceRequest::kTempSpace}; + }) +#endif .set_attr("FCompute", TrueDivideScalarCompute) .set_attr("FGradient", ElemwiseGradUseNone{"_backward_rdiv_scalar"}) .add_argument("data", "NDArray-or-Symbol", "source input") diff --git a/src/operator/numpy/np_true_divide.cu b/src/operator/numpy/np_true_divide.cu index c026d689233d..7211f4a0a006 100644 --- a/src/operator/numpy/np_true_divide.cu +++ b/src/operator/numpy/np_true_divide.cu @@ -29,7 +29,7 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_npi_true_divide) -.set_attr("FCompute", TrueDivideBroadcastCompute); +.set_attr("FCompute", TrueDivideBroadcastCompute); NNVM_REGISTER_OP(_npi_true_divide_scalar) .set_attr("FCompute", TrueDivideScalarCompute); diff --git a/src/operator/tensor/elemwise_binary_broadcast_op.h b/src/operator/tensor/elemwise_binary_broadcast_op.h index 3d3bcfacbd05..ad06df8d92be 100644 --- a/src/operator/tensor/elemwise_binary_broadcast_op.h +++ b/src/operator/tensor/elemwise_binary_broadcast_op.h @@ -187,9 +187,10 @@ inline int BinaryBroadcastShapeCompact(const mxnet::TShape& lshape, const mxnet: } namespace mxnet_op { -template +template struct binary_broadcast_kernel { /*! 
\brief Map function for binary_broadcast_kernel */ + template MSHADOW_XINLINE static void Map(index_t base, index_t length, OpReqType req, const Shape &lstride, const Shape &rstride, const Shape &oshape, IType *lhs, IType *rhs, @@ -208,6 +209,7 @@ struct binary_broadcast_kernel { } /*! \brief Map function for binary_broadcast_kernel */ + template MSHADOW_XINLINE static void Map(index_t base, index_t length, OpReqType req, const Shape &lstride, const Shape &rstride, const Shape &oshape, IType lhs, IType *rhs, @@ -224,6 +226,49 @@ struct binary_broadcast_kernel { KERNEL_ASSIGN(out[base + i], req, OP::Map(lhs, rhs[ridx])); } } + +#ifndef _WIN32 + /*! \brief Map function for binary_broadcast_kernel */ + template::value, int>::type = 0> + MSHADOW_XINLINE static void Map(index_t base, index_t length, OpReqType req, + const Shape &lstride, const Shape &rstride, + const Shape &oshape, IType *lhs, DType *rhs, + DType *out) { + Shape coord = unravel(base, oshape); + auto lidx = static_cast(dot(coord, lstride)); + auto ridx = static_cast(dot(coord, rstride)); + KERNEL_ASSIGN(out[base], req, OP::Map(lhs[lidx], rhs[ridx])); + // starts from 1 to avoid extra inc at end of loop + for (index_t i = 1; i < length; ++i) { + inc(&coord, oshape, &lidx, lstride, &ridx, rstride); + // When tuning, don't actually run the op, since it's not going to be tuned against + // the actual op we'll eventually be using + KERNEL_ASSIGN(out[base + i], req, OP::Map(lhs[lidx], rhs[ridx])); + } + } + + /*! \brief Map function for binary_broadcast_kernel */ + template::value && + !std::is_pointer::value, int>::type = 0> + MSHADOW_XINLINE static void Map(index_t base, index_t length, OpReqType req, + const Shape &lstride, const Shape &rstride, + const Shape &oshape, IType lhs, DType *rhs, + DType *out) { + Shape coord = unravel(base, oshape); + auto lidx = static_cast(dot(coord, lstride)); + auto ridx = static_cast(dot(coord, rstride)); + KERNEL_ASSIGN(out[base], req, OP::Map(lhs, rhs[ridx])); + // starts from 1 to avoid extra inc at end of loop + for (index_t i = 1; i < length; ++i) { + inc(&coord, oshape, &lidx, lstride, &ridx, rstride); + // When tuning, don't actually run the op, since it's not going to be tuned against + // the actual op we'll eventually be using + KERNEL_ASSIGN(out[base + i], req, OP::Map(lhs, rhs[ridx])); + } + } +#endif }; template @@ -307,7 +352,7 @@ void BinaryBroadcastCompute(const nnvm::NodeAttrs& attrs, mshadow::Shape oshape = new_oshape.get(); mshadow::Shape lstride = mxnet_op::calc_stride(new_lshape.get()); mshadow::Shape rstride = mxnet_op::calc_stride(new_rshape.get()); - mxnet_op::Kernel, xpu>:: + mxnet_op::Kernel, xpu>:: template LaunchEx(s, new_oshape.Size(), req[0], lstride, rstride, oshape, inputs[0].dptr(), inputs[1].dptr(), outputs[0].dptr()); }); @@ -336,7 +381,7 @@ void BinaryBroadcastComputeLogic(const nnvm::NodeAttrs& attrs, mshadow::Shape oshape = new_oshape.get(); mshadow::Shape lstride = mxnet_op::calc_stride(new_lshape.get()); mshadow::Shape rstride = mxnet_op::calc_stride(new_rshape.get()); - mxnet_op::Kernel, xpu>:: + mxnet_op::Kernel, xpu>:: template LaunchEx(s, new_oshape.Size(), req[0], lstride, rstride, oshape, inputs[0].dptr(), inputs[1].dptr(), outputs[0].dptr()); @@ -444,11 +489,11 @@ void BinaryBroadcastCsrDnsDnsImpl(const OpContext& ctx, Shape lstride = calc_stride(new_csrshape.get()); Shape rstride = calc_stride(new_dnsshape.get()); if (reverse && std::is_same::value) { - Kernel, xpu>:: + Kernel, xpu>:: template LaunchEx(s, new_oshape.Size(), req, lstride, rstride, 
oshape, DType(0), dns_data.dptr(), out_data.dptr()); } else { - Kernel, xpu>:: + Kernel, xpu>:: template LaunchEx(s, new_oshape.Size(), req, lstride, rstride, oshape, DType(0), dns_data.dptr(), out_data.dptr()); } @@ -658,7 +703,7 @@ void BinaryBroadcastBackwardUseIn(const nnvm::NodeAttrs& attrs, [](const NodeAttrs& attrs) { \ return std::vector{"lhs", "rhs"}; \ }) \ - .set_attr("FInferShape", BinaryBroadcastShape) \ + .set_attr("FInferShape", BinaryBroadcastShape) \ .set_attr("FInferType", ElemwiseType<2, 1>) \ .set_attr("FInplaceOption", \ [](const NodeAttrs& attrs){ \ diff --git a/src/operator/tensor/elemwise_binary_scalar_op.h b/src/operator/tensor/elemwise_binary_scalar_op.h index 02b005eed995..834bbdbfc3d1 100644 --- a/src/operator/tensor/elemwise_binary_scalar_op.h +++ b/src/operator/tensor/elemwise_binary_scalar_op.h @@ -256,7 +256,7 @@ class BinaryScalarOp : public UnaryOp { using namespace mshadow::expr; Stream *s = ctx.get_stream(); const double alpha = nnvm::get(attrs.parsed); - MSHADOW_TYPE_SWITCH(inputs[0].type_flag_, DType, { + MSHADOW_TYPE_SWITCH_WITH_BOOL(inputs[0].type_flag_, DType, { MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { mxnet_op::Kernel, xpu>::Launch( s, inputs[0].Size(), outputs[0].dptr(), inputs[0].dptr(), DType(alpha)); diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py index 3b4b4b6491fc..e6d12da23582 100644 --- a/tests/python/unittest/test_numpy_op.py +++ b/tests/python/unittest/test_numpy_op.py @@ -1940,7 +1940,7 @@ def get_new_shape(shape, axis): with mx.autograd.record(): y = test_concat(a, b, c, d) - + assert y.shape == expected_ret.shape assert_almost_equal(y.asnumpy(), expected_ret, rtol=1e-3, atol=1e-5) @@ -2933,7 +2933,7 @@ def check_cholesky(L, data_np): test_cholesky = TestCholesky() if hybridize: test_cholesky.hybridize() - + # Numerical issue: # When backpropagating through Cholesky decomposition, we need to compute the inverse # of L according to dA = 0.5 * L**(-T) * copyLTU(L**T * dL) * L**(-1) where A = LL^T. 
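
[Note, not part of the patch] The hunk that follows extends test_np_true_divide to cover the new mixed-precision behaviour. A rough illustrative sketch of the dtype rules it checks, assuming the `np` (mxnet.numpy) and `_np` (NumPy) aliases already used in this test file:

    a = np.random.uniform(3, 50, size=(2, 3)).astype(np.int32)
    b = np.random.uniform(3, 50, size=(2, 3)).astype(np.float16)

    (a / a).dtype   # float32: dividing two integer (or bool) arrays promotes to float32
    (a / b).dtype   # float16: mixing an integer array with a float array keeps the float dtype
    (b / a).dtype   # float16: same rule regardless of operand order

Mixing two different float dtypes (e.g. float16 with float64) is not exercised here; the kernels above still log "not implemented yet..." for that case.
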
@@ -3847,12 +3847,14 @@ def test_np_true_divide(): [(2, 3, 1), (1, 4)], [(2, 1, 4, 1), (3, 1, 5)], ] - dtypes = [np.int8, np.uint8, np.int32, np.int64, np.float16, np.float32, np.float64] + dtypes = [np.bool, np.int8, np.uint8, np.int32, np.int64, np.float16, np.float32, np.float64] + itypes = [np.bool, np.int8, np.uint8, np.int32, np.int64] + ftypes = [np.float16, np.float32, np.float64] for shape_pair, dtype in itertools.product(shapes, dtypes): a = np.random.uniform(3, 50, size=shape_pair[0]).astype(dtype) b = np.random.uniform(3, 50, size=shape_pair[-1]).astype(dtype) out_mx = a / b - if _np.issubdtype(dtype, _np.integer): + if _np.issubdtype(dtype, _np.integer) or (dtype is np.bool): assert out_mx.dtype == np.float32 else: assert out_mx.dtype == dtype @@ -3868,6 +3870,20 @@ def test_np_true_divide(): out_np = _np.true_divide(val, a.asnumpy()) assert_almost_equal(out_mx.asnumpy(), out_np, rtol=1e-3, atol=1e-3, use_broadcast=False) + for shape_pair, itype, ftype in itertools.product(shapes, itypes, ftypes): + i_ = np.random.uniform(3, 50, size=shape_pair[0]).astype(itype) + f_ = np.random.uniform(3, 50, size=shape_pair[-1]).astype(ftype) + + out_mx = i_ / f_ + assert out_mx.dtype == ftype + out_np = _np.true_divide(i_.asnumpy(), f_.asnumpy()) + assert_almost_equal(out_mx.asnumpy(), out_np, rtol=1e-3, atol=1e-3, use_broadcast=False) + + out_mx = f_ / i_ + assert out_mx.dtype == ftype + out_np = _np.true_divide(f_.asnumpy(), i_.asnumpy()) + assert_almost_equal(out_mx.asnumpy(), out_np, rtol=1e-3, atol=1e-3, use_broadcast=False) + @with_seed() @use_np
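
[Note, not part of the patch] For reference, the integer/float promotion rule added in src/common/utils.h (more_precise_type / np_binary_out_type) can be summarised in Python roughly as follows. This is an illustrative sketch only; dtype names follow NumPy, and the bool case is left out because the C++ helper does not treat it specially:

    import numpy as np

    _FLOAT_ORDER = [np.float16, np.float32, np.float64]
    _INT_ORDER = [np.int8, np.uint8, np.int32, np.int64]

    def more_precise_type(t1, t2):
        # mirrors common::more_precise_type: higher-precision float wins,
        # float wins over integer, otherwise the wider integer wins
        if t1 == t2:
            return t1
        if t1 in _FLOAT_ORDER and t2 in _FLOAT_ORDER:
            return max(t1, t2, key=_FLOAT_ORDER.index)
        if t1 in _FLOAT_ORDER or t2 in _FLOAT_ORDER:
            return t1 if t1 in _FLOAT_ORDER else t2
        # the int8/uint8 pair is rejected by a CHECK in the C++ helper;
        # np_binary_out_type below promotes it to int32 before getting here
        return max(t1, t2, key=_INT_ORDER.index)

    def np_binary_out_type(t1, t2):
        # mirrors common::np_binary_out_type
        if {t1, t2} == {np.int8, np.uint8}:
            return np.int32   # no common 8-bit type that holds both ranges
        return more_precise_type(t1, t2)

For true_divide itself, TrueDivideOutType in np_true_divide.cc additionally maps any all-integer input combination to float32, which is what the test hunk above asserts.
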