From f6aa9e973656bebaf650bcd137ae63a0ecb3aefa Mon Sep 17 00:00:00 2001
From: Hao Jin
Date: Fri, 1 Nov 2019 23:56:03 +0000
Subject: [PATCH] workaround for windows

---
 src/operator/mshadow_op.h                     |   6 +
 .../numpy/np_elemwise_broadcast_op.cc         |  36 +++-
 .../numpy/np_elemwise_broadcast_op.cu         |   9 +-
 src/operator/numpy/np_elemwise_broadcast_op.h |  47 +++--
 src/operator/numpy/np_true_divide-inl.h       | 172 ++++++++++++------
 src/operator/numpy/np_true_divide.cc          |  18 ++
 tests/python/unittest/test_numpy_op.py        |  30 ++-
 7 files changed, 224 insertions(+), 94 deletions(-)

diff --git a/src/operator/mshadow_op.h b/src/operator/mshadow_op.h
index 765bfc2588b8..e586a1f4ad49 100644
--- a/src/operator/mshadow_op.h
+++ b/src/operator/mshadow_op.h
@@ -133,6 +133,7 @@ struct true_divide : public mxnet_op::tunable {
     return static_cast<float>(a) / static_cast<float>(b);
   }
 
+#ifndef _WIN32
   template<typename DType,
            typename std::enable_if<std::is_integral<DType>::value, int>::type = 0>
   MSHADOW_XINLINE static mshadow::half::half_t Map(DType a, mshadow::half::half_t b) {
     return static_cast<mshadow::half::half_t>(a) / b;
   }
@@ -150,6 +151,7 @@ struct true_divide : public mxnet_op::tunable {
   MSHADOW_XINLINE static double Map(DType a, double b) {
     return static_cast<double>(a) / b;
   }
+#endif
 };
 
 struct rtrue_divide : public mxnet_op::tunable {
@@ -165,6 +167,7 @@ struct rtrue_divide : public mxnet_op::tunable {
     return static_cast<float>(b) / static_cast<float>(a);
   }
 
+#ifndef _WIN32
   template<typename DType,
            typename std::enable_if<std::is_integral<DType>::value, int>::type = 0>
   MSHADOW_XINLINE static mshadow::half::half_t Map(DType a, mshadow::half::half_t b) {
     return b / static_cast<mshadow::half::half_t>(a);
   }
@@ -182,6 +185,7 @@ struct rtrue_divide : public mxnet_op::tunable {
   MSHADOW_XINLINE static double Map(DType a, double b) {
     return b / static_cast<double>(a);
   }
+#endif
 };
 
 MXNET_BINARY_MATH_OP_NC(left, a);
@@ -190,6 +194,7 @@ MXNET_BINARY_MATH_OP_NC(right, b);
 
 MXNET_BINARY_MATH_OP_NC(mul, a * b);
 
+#ifndef _WIN32
 struct mixed_mul {
   template<typename DType,
            typename std::enable_if<std::is_integral<DType>::value, int>::type = 0>
   MSHADOW_XINLINE static mshadow::half::half_t Map(DType a, mshadow::half::half_t b) {
     return static_cast<mshadow::half::half_t>(a) * b;
   }
 };
+#endif
 
 MXNET_BINARY_MATH_OP_NC(div, a / b);
diff --git a/src/operator/numpy/np_elemwise_broadcast_op.cc b/src/operator/numpy/np_elemwise_broadcast_op.cc
index 592ea9e84ea9..70943f0bab7f 100644
--- a/src/operator/numpy/np_elemwise_broadcast_op.cc
+++ b/src/operator/numpy/np_elemwise_broadcast_op.cc
@@ -54,7 +54,6 @@ bool NumpyBinaryScalarType(const nnvm::NodeAttrs& attrs,
   .add_argument("data", "NDArray-or-Symbol", "source input")                 \
   .add_argument("scalar", "float", "scalar input")
 
-#ifndef _WIN32
 bool NumpyBinaryMixedPrecisionType(const nnvm::NodeAttrs& attrs,
                                    std::vector<int>* in_attrs,
                                    std::vector<int>* out_attrs) {
@@ -71,6 +70,28 @@ bool NumpyBinaryMixedPrecisionType(const nnvm::NodeAttrs& attrs,
   return true;
 }
 
+#ifdef _WIN32
+#define MXNET_OPERATOR_REGISTER_NP_BINARY_MIXED_PRECISION(name)               \
+  NNVM_REGISTER_OP(name)                                                      \
+  .set_num_inputs(2)                                                          \
+  .set_num_outputs(1)                                                         \
+  .set_attr<nnvm::FListInputNames>("FListInputNames",                         \
+    [](const NodeAttrs& attrs) {                                              \
+      return std::vector<std::string>{"lhs", "rhs"};                          \
+    })                                                                        \
+  .set_attr<mxnet::FInferShape>("FInferShape", BinaryBroadcastShape)          \
+  .set_attr<nnvm::FInferType>("FInferType", NumpyBinaryMixedPrecisionType)    \
+  .set_attr<FInplaceOption>("FInplaceOption",                                 \
+    [](const NodeAttrs& attrs){                                               \
+      return std::vector<std::pair<int, int> >{{0, 0}, {1, 0}};               \
+    })                                                                        \
+  .set_attr<FResourceRequest>("FResourceRequest",                             \
+    [](const NodeAttrs& attrs) {                                              \
+      return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};       \
+    })                                                                        \
+  .add_argument("lhs", "NDArray-or-Symbol", "First input to the function")    \
+  .add_argument("rhs", "NDArray-or-Symbol", "Second input to the function")
+#else
 #define MXNET_OPERATOR_REGISTER_NP_BINARY_MIXED_PRECISION(name)               \
   NNVM_REGISTER_OP(name)                                                      \
   .set_num_inputs(2)                                                          \
@@ -97,12 +118,18 @@ MXNET_OPERATOR_REGISTER_BINARY_BROADCAST(_npi_subtract)
 .set_attr<FCompute>("FCompute", BinaryBroadcastCompute<cpu, op::mshadow_op::minus>)
 .set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseNone{"_backward_broadcast_sub"});
 
-#ifndef _WIN32
 MXNET_OPERATOR_REGISTER_NP_BINARY_MIXED_PRECISION(_npi_multiply)
+#ifndef _WIN32
 .set_attr<FCompute>(
   "FCompute",
   MixedBinaryBroadcastCompute<cpu, op::mshadow_op::mul, op::mshadow_op::mixed_mul,
                               op::mshadow_op::mixed_mul>)
+#else
+.set_attr<FCompute>(
+  "FCompute",
+  MixedBinaryBroadcastCompute<cpu, op::mshadow_op::mul, op::mshadow_op::mul,
+                              op::mshadow_op::mul>)
+#endif
 .set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseIn{"_backward_npi_broadcast_mul"});
 
 NNVM_REGISTER_OP(_backward_npi_broadcast_mul)
@@ -119,11 +146,6 @@ NNVM_REGISTER_OP(_backward_npi_broadcast_mul)
   })
 .set_attr<FCompute>("FCompute", MixedBinaryBackwardUseIn<cpu, mshadow_op::right, mshadow_op::left>);
 
-#else
-MXNET_OPERATOR_REGISTER_BINARY_BROADCAST(_npi_multiply)
-.set_attr<FCompute>("FCompute", BinaryBroadcastCompute<cpu, op::mshadow_op::mul>)
-.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseIn{"_backward_broadcast_mul"});
-#endif
 
 MXNET_OPERATOR_REGISTER_BINARY_BROADCAST(_npi_mod)
 .set_attr<FCompute>("FCompute", BinaryBroadcastCompute<cpu, op::mshadow_op::mod>)
diff --git a/src/operator/numpy/np_elemwise_broadcast_op.cu b/src/operator/numpy/np_elemwise_broadcast_op.cu
index a184c0b84a35..66c9b8e74e7c 100644
--- a/src/operator/numpy/np_elemwise_broadcast_op.cu
+++ b/src/operator/numpy/np_elemwise_broadcast_op.cu
@@ -41,13 +41,16 @@ NNVM_REGISTER_OP(_npi_multiply)
   "FCompute",
   MixedBinaryBroadcastCompute<gpu, op::mshadow_op::mul, op::mshadow_op::mixed_mul,
                               op::mshadow_op::mixed_mul>);
+#else
+.set_attr<FCompute>(
+  "FCompute",
+  MixedBinaryBroadcastCompute<gpu, op::mshadow_op::mul, op::mshadow_op::mul,
+                              op::mshadow_op::mul>);
+#endif
 
 NNVM_REGISTER_OP(_backward_npi_broadcast_mul)
 .set_attr<FCompute>("FCompute", MixedBinaryBackwardUseIn<gpu, mshadow_op::right, mshadow_op::left>);
 
-#else
-.set_attr<FCompute>("FCompute", BinaryBroadcastCompute<gpu, op::mshadow_op::mul>);
-#endif
 
 NNVM_REGISTER_OP(_npi_mod)
 .set_attr<FCompute>("FCompute", BinaryBroadcastCompute<gpu, op::mshadow_op::mod>);
diff --git a/src/operator/numpy/np_elemwise_broadcast_op.h b/src/operator/numpy/np_elemwise_broadcast_op.h
index 081af396bd6b..55e637158613 100644
--- a/src/operator/numpy/np_elemwise_broadcast_op.h
+++ b/src/operator/numpy/np_elemwise_broadcast_op.h
@@ -39,7 +39,6 @@ void MixedBinaryElemwiseCompute(const nnvm::NodeAttrs& attrs,
                                 const std::vector<TBlob>& inputs,
                                 const std::vector<OpReqType>& req,
                                 const std::vector<TBlob>& outputs) {
-  // TODO(haojin2): No mixed-precision multiply on windows temporarily due to CI issues.
 #ifndef _WIN32
   using namespace mshadow;
   using namespace mxnet_op;
@@ -71,7 +70,7 @@ void MixedBinaryElemwiseCompute(const nnvm::NodeAttrs& attrs,
     });
   });
 #else
-  LOG(ERROR) << "mixed precision multiply is not supported on windows yet...";
+  LOG(ERROR) << "windows should not reach here...";
 #endif
 }
 
@@ -92,22 +91,18 @@ void MixedBinaryBroadcastCompute(const nnvm::NodeAttrs& attrs,
 
   if ((out.shape_.Size() == 0U) || (req[0] == kNullOp)) return;
 
-  mxnet::TShape new_lshape, new_rshape, new_oshape;
-  int ndim = BinaryBroadcastShapeCompact(lhs.shape_, rhs.shape_, out.shape_,
-                                         &new_lshape, &new_rshape, &new_oshape);
-
-
   if (lhs.type_flag_ == rhs.type_flag_) {
     BinaryBroadcastCompute<xpu, OP>(attrs, ctx, inputs, req, outputs);
     return;
   }
 
-  // TODO(haojin2): No mixed-precision multiply on windows temporarily due to CI issues.
-#ifndef _WIN32
   CHECK((lhs.type_flag_ == mshadow::kBool) || (rhs.type_flag_ == mshadow::kBool))
     << "now supports bool with another type only";
-
+#ifndef _WIN32
+  mxnet::TShape new_lshape, new_rshape, new_oshape;
+  int ndim = BinaryBroadcastShapeCompact(lhs.shape_, rhs.shape_, out.shape_,
+                                         &new_lshape, &new_rshape, &new_oshape);
   if (!ndim) {
     MixedBinaryElemwiseCompute<xpu, LOP, ROP>(attrs, ctx, inputs, req, outputs);
   } else {
@@ -130,7 +125,37 @@ void MixedBinaryBroadcastCompute(const nnvm::NodeAttrs& attrs,
     });
   }
 #else
-  LOG(ERROR) << "mixed precision multiply is not supported on windows yet...";
+  mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
+  if (common::is_float(lhs.type_flag_) && common::is_float(rhs.type_flag_)) {
+    LOG(ERROR) << "not implemented yet...";
+  } else if (common::is_float(lhs.type_flag_) || common::is_float(rhs.type_flag_)) {
+    TBlob temp_tblob;
+    // one is float, the other is bool
+    CHECK_EQ(out.type_flag_,
+             common::is_float(lhs.type_flag_) ? lhs.type_flag_ : rhs.type_flag_)
+      << "This case out type should be same as the float type";
+    if (common::is_float(lhs.type_flag_)) {
+      MSHADOW_REAL_TYPE_SWITCH(lhs.type_flag_, LType, {
+        Tensor<xpu, 1, LType> temp_tensor =
+          ctx.requested[0].get_space_typed<xpu, 1, LType>(Shape1(rhs.Size()), s);
+        temp_tblob = TBlob(temp_tensor);
+      });
+      CastCompute<xpu>(attrs, ctx, {rhs}, {kWriteTo}, {temp_tblob});
+      BinaryBroadcastCompute<xpu, OP>(
+        attrs, ctx, {lhs, temp_tblob.reshape(rhs.shape_)}, req, outputs);
+    } else {
+      MSHADOW_REAL_TYPE_SWITCH(rhs.type_flag_, RType, {
+        Tensor<xpu, 1, RType> temp_tensor =
+          ctx.requested[0].get_space_typed<xpu, 1, RType>(Shape1(lhs.Size()), s);
+        temp_tblob = TBlob(temp_tensor);
+      });
+      CastCompute<xpu>(attrs, ctx, {lhs}, {kWriteTo}, {temp_tblob});
+      BinaryBroadcastCompute<xpu, OP>(
+        attrs, ctx, {temp_tblob.reshape(lhs.shape_), rhs}, req, outputs);
+    }
+  } else {
+    LOG(ERROR) << "not implemented yet...";
+  }
 #endif
 }
diff --git a/src/operator/numpy/np_true_divide-inl.h b/src/operator/numpy/np_true_divide-inl.h
index 8aa32661fd3c..0bc60a08803e 100644
--- a/src/operator/numpy/np_true_divide-inl.h
+++ b/src/operator/numpy/np_true_divide-inl.h
@@ -57,6 +57,7 @@ void TrueDivideScalarCompute(const nnvm::NodeAttrs &attrs,
       });
     });
   } else {
+#ifndef _WIN32
     CHECK_EQ(outputs[0].type_flag_, kFloat32) << "true_divide only supports float32 output "
                                                  "when input's dtype is "
                                               << type_string(inputs[0].type_flag_);
@@ -67,6 +68,13 @@ void TrueDivideScalarCompute(const nnvm::NodeAttrs &attrs,
                                             static_cast<float>(alpha));
       });
     });
+#else
+    Tensor<xpu, 1, float> temp_tensor =
+      ctx.requested[0].get_space_typed<xpu, 1, float>(mshadow::Shape1(data.Size()), s);
+    TBlob temp_tblob(temp_tensor);
+    CastCompute<xpu>(attrs, ctx, {data}, {kWriteTo}, {temp_tblob});
+    TrueDivideScalarCompute<xpu, OP>(attrs, ctx, {temp_tblob}, req, outputs);
+#endif
   }
 }
 
@@ -85,85 +93,104 @@ void TrueDivideElemwiseCompute(const nnvm::NodeAttrs &attrs,
   const TBlob& lhs = inputs[0];
   const TBlob& rhs = inputs[1];
   const TBlob& out = outputs[0];
-  // TODO(haojin2): No mixed-precision true_divide on windows temporarily due to CI issues.
-#ifndef _WIN32
-  MXNET_ASSIGN_REQ_SWITCH(req[0], Req, {
-    if (lhs.type_flag_ == rhs.type_flag_) {
-      // Case when types of the 2 input tensors are the same
-      if (common::is_float(lhs.type_flag_)) {
-        // If both are the same floats, normal launch
+  if (lhs.type_flag_ == rhs.type_flag_) {
+    // Case when types of the 2 input tensors are the same
+    if (common::is_float(lhs.type_flag_)) {
+      // If both are the same floats, normal launch
+      MXNET_ASSIGN_REQ_SWITCH(req[0], Req, {
         MSHADOW_REAL_TYPE_SWITCH(lhs.type_flag_, DType, {
           Kernel<op_with_req<OP, Req>, xpu>::Launch(
             s, out.Size(), out.dptr<DType>(), lhs.dptr<DType>(), rhs.dptr<DType>());
         });
-      } else {
-        // If both are the same integers, output is float32
-        CHECK_EQ(out.type_flag_, kFloat32) << "true_divide only supports float32 output "
-                                              "when input's dtype is "
-                                           << type_string(lhs.type_flag_);
+      });
+    } else {
+      // If both are the same integers, output is float32
+      CHECK_EQ(out.type_flag_, kFloat32) << "true_divide only supports float32 output "
+                                            "when input's dtype is "
+                                         << type_string(lhs.type_flag_);
+      MXNET_ASSIGN_REQ_SWITCH(req[0], Req, {
         MXNET_INT_TYPE_SWITCH(lhs.type_flag_, DType, {
           Kernel<op_with_req<OP, Req>, xpu>::Launch(
             s, out.Size(), out.dptr<float>(), lhs.dptr<DType>(), rhs.dptr<DType>());
         });
-      }
-    } else {
-      // Case when types of the 2 input tensors are different
-      if (common::is_float(lhs.type_flag_) && common::is_float(rhs.type_flag_)) {
-        // both lhs and rhs are float types, output type is the more precise one
-        LOG(ERROR) << "not implemented yet...";
-      } else if (common::is_float(lhs.type_flag_) || common::is_float(rhs.type_flag_)) {
-        // lhs is float type, rhs is integer type, the output type should be the same as lhs
-        CHECK_EQ(out.type_flag_,
-                 common::is_float(lhs.type_flag_) ? lhs.type_flag_ : rhs.type_flag_)
-          << "This case out type should be same as the float type";
-        if (common::is_float(lhs.type_flag_)) {
-          // lhs is the float one
+      });
+    }
+  } else {
+#ifndef _WIN32
+    // Non-windows case: no usage of temporary space
+    // Case when types of the 2 input tensors are different
+    if (common::is_float(lhs.type_flag_) && common::is_float(rhs.type_flag_)) {
+      // both lhs and rhs are float types, output type is the more precise one
+      LOG(ERROR) << "not implemented yet...";
+    } else if (common::is_float(lhs.type_flag_) || common::is_float(rhs.type_flag_)) {
+      // one is float type, the other is integer type, the output type should be the same as float
+      CHECK_EQ(out.type_flag_,
+               common::is_float(lhs.type_flag_) ? lhs.type_flag_ : rhs.type_flag_)
+        << "This case out type should be same as the float type";
+      if (common::is_float(lhs.type_flag_)) {
+        // lhs is the float one
+        MXNET_ASSIGN_REQ_SWITCH(req[0], Req, {
           MSHADOW_REAL_TYPE_SWITCH(lhs.type_flag_, LType, {
             MXNET_INT_TYPE_SWITCH(rhs.type_flag_, RType, {
               Kernel<op_with_req<mshadow_op::rtrue_divide, Req>, xpu>::Launch(
                 s, out.Size(), out.dptr<LType>(), rhs.dptr<RType>(), lhs.dptr<LType>());
             });
           });
-        } else {
-          // rhs is the float one
+        });
+      } else {
+        // rhs is the float one
+        MXNET_ASSIGN_REQ_SWITCH(req[0], Req, {
           MXNET_INT_TYPE_SWITCH(lhs.type_flag_, LType, {
             MSHADOW_REAL_TYPE_SWITCH(rhs.type_flag_, RType, {
               Kernel<op_with_req<OP, Req>, xpu>::Launch(
                 s, out.Size(), out.dptr<RType>(), lhs.dptr<LType>(), rhs.dptr<RType>());
             });
           });
-        }
-      } else {
-        // lhs is integer type, rhs is integer type, output type should be float
-        LOG(ERROR) << "not implemented yet...";
+        });
       }
+    } else {
+      // lhs is integer type, rhs is integer type, output type should be float
+      LOG(ERROR) << "not implemented yet...";
     }
-  });
 #else
-  MXNET_ASSIGN_REQ_SWITCH(req[0], Req, {
-    if (lhs.type_flag_ == rhs.type_flag_) {
-      // Case when types of the 2 input tensors are the same
+    // Windows case: using temp space for casting the type
+    // Case when types of the 2 input tensors are different
+    if (common::is_float(lhs.type_flag_) && common::is_float(rhs.type_flag_)) {
+      // both lhs and rhs are float types, output type is the more precise one
+      LOG(ERROR) << "not implemented yet...";
+    } else if (common::is_float(lhs.type_flag_) || common::is_float(rhs.type_flag_)) {
+      // lhs is float type, rhs is integer type, the output type should be the same as lhs
+      CHECK_EQ(out.type_flag_,
+               common::is_float(lhs.type_flag_) ? lhs.type_flag_ : rhs.type_flag_)
+        << "This case out type should be same as the float type";
+      TBlob temp_tblob;
       if (common::is_float(lhs.type_flag_)) {
-        // If both are the same floats, normal launch
-        MSHADOW_REAL_TYPE_SWITCH(lhs.type_flag_, DType, {
-          Kernel<op_with_req<OP, Req>, xpu>::Launch(
-            s, out.Size(), out.dptr<DType>(), lhs.dptr<DType>(), rhs.dptr<DType>());
+        // lhs is the float one
+        MSHADOW_REAL_TYPE_SWITCH(lhs.type_flag_, LType, {
+          Tensor<xpu, 1, LType> temp_tensor =
+            ctx.requested[0].get_space_typed<xpu, 1, LType>(mshadow::Shape1(rhs.Size()), s);
+          temp_tblob = TBlob(temp_tensor);
         });
+        CastCompute<xpu>(attrs, ctx, {rhs}, {kWriteTo}, {temp_tblob});
+        TrueDivideElemwiseCompute<xpu, OP>(
+          attrs, ctx, {lhs, temp_tblob.reshape(rhs.shape_)}, req, outputs);
       } else {
-        // If both are the same integers, output is float32
-        CHECK_EQ(out.type_flag_, kFloat32) << "true_divide only supports float32 output "
-                                              "when input's dtype is "
-                                           << type_string(lhs.type_flag_);
-        MXNET_INT_TYPE_SWITCH(lhs.type_flag_, DType, {
-          Kernel<op_with_req<OP, Req>, xpu>::Launch(
-            s, out.Size(), out.dptr<float>(), lhs.dptr<DType>(), rhs.dptr<DType>());
+        // rhs is the float one
+        MSHADOW_REAL_TYPE_SWITCH(rhs.type_flag_, RType, {
+          Tensor<xpu, 1, RType> temp_tensor =
+            ctx.requested[0].get_space_typed<xpu, 1, RType>(mshadow::Shape1(lhs.Size()), s);
+          temp_tblob = TBlob(temp_tensor);
        });
+        CastCompute<xpu>(attrs, ctx, {lhs}, {kWriteTo}, {temp_tblob});
+        TrueDivideElemwiseCompute<xpu, OP>(
+          attrs, ctx, {temp_tblob.reshape(lhs.shape_), rhs}, req, outputs);
       }
     } else {
-      LOG(ERROR) << "mixed precision true_divide is not supported on windows yet...";
+      // lhs is integer type, rhs is integer type, output type should be float
+      LOG(ERROR) << "not implemented yet...";
     }
-  });
 #endif
+  }
 }
 
 template<typename xpu, typename OP>
@@ -186,7 +213,6 @@ void TrueDivideBroadcastCompute(const nnvm::NodeAttrs& attrs,
   const TBlob& lhs = inputs[0];
   const TBlob& rhs = inputs[1];
   const TBlob& out = outputs[0];
-  // TODO(haojin2): No mixed-precision true_divide on windows temporarily due to CI issues.
 #ifndef _WIN32
   BROADCAST_NDIM_SWITCH(ndim, NDim, {
     mshadow::Shape<NDim> oshape = new_oshape.get<NDim>();
@@ -248,11 +274,11 @@ void TrueDivideBroadcastCompute(const nnvm::NodeAttrs& attrs,
     }
   });
 #else
-  BROADCAST_NDIM_SWITCH(ndim, NDim, {
-    mshadow::Shape<NDim> oshape = new_oshape.get<NDim>();
-    mshadow::Shape<NDim> lstride = calc_stride(new_lshape.get<NDim>());
-    mshadow::Shape<NDim> rstride = calc_stride(new_rshape.get<NDim>());
-    if (lhs.type_flag_ == rhs.type_flag_) {
+  if (lhs.type_flag_ == rhs.type_flag_) {
+    BROADCAST_NDIM_SWITCH(ndim, NDim, {
+      mshadow::Shape<NDim> oshape = new_oshape.get<NDim>();
+      mshadow::Shape<NDim> lstride = calc_stride(new_lshape.get<NDim>());
+      mshadow::Shape<NDim> rstride = calc_stride(new_rshape.get<NDim>());
       // When the both inputs have the same data types
       if (common::is_float(lhs.type_flag_)) {
         // If both inputs are the same float types, output is the same float type
@@ -272,10 +298,44 @@ void TrueDivideBroadcastCompute(const nnvm::NodeAttrs& attrs,
                                                            lhs.dptr<DType>(), rhs.dptr<DType>(),
                                                            out.dptr<float>());
         });
       }
+    });
+  } else {
+    if (common::is_float(lhs.type_flag_) && common::is_float(rhs.type_flag_)) {
+      // lhs and rhs have different float types, the output is the more precise one
+      LOG(ERROR) << "not implemented yet...";
+    } else if (common::is_float(lhs.type_flag_) || common::is_float(rhs.type_flag_)) {
+      // one of lhs and rhs is float, the output is the same type as the float one
+      TBlob temp_tblob;
+      if (common::is_float(lhs.type_flag_)) {
+        // lhs is float type, output will be the same float type
+        CHECK_EQ(lhs.type_flag_, out.type_flag_)
+          << "lhs should have the same type as out, infer type broken?";
+        MSHADOW_REAL_TYPE_SWITCH(lhs.type_flag_, LType, {
+          Tensor<xpu, 1, LType> temp_tensor =
+            ctx.requested[0].get_space_typed<xpu, 1, LType>(mshadow::Shape1(rhs.Size()), s);
+          temp_tblob = TBlob(temp_tensor);
+        });
+        CastCompute<xpu>(attrs, ctx, {rhs}, {kWriteTo}, {temp_tblob});
+        TrueDivideBroadcastCompute<xpu, OP>(
+          attrs, ctx, {lhs, temp_tblob.reshape(rhs.shape_)}, req, outputs);
+      } else {
+        // rhs is float type, output will be the same float type
+        CHECK_EQ(rhs.type_flag_, out.type_flag_)
+          << "rhs should have the same type as out, infer type broken?";
+        MSHADOW_REAL_TYPE_SWITCH(rhs.type_flag_, RType, {
+          Tensor<xpu, 1, RType> temp_tensor =
+            ctx.requested[0].get_space_typed<xpu, 1, RType>(mshadow::Shape1(lhs.Size()), s);
+          temp_tblob = TBlob(temp_tensor);
+        });
+        CastCompute<xpu>(attrs, ctx, {lhs}, {kWriteTo}, {temp_tblob});
+        TrueDivideBroadcastCompute<xpu, OP>(
+          attrs, ctx, {temp_tblob.reshape(lhs.shape_), rhs}, req, outputs);
+      }
     } else {
-      LOG(ERROR) << "mixed precision true_divide is not supported on windows yet...";
+      // lhs and rhs have different integer types, the output is float type
+      LOG(ERROR) << "not implemented yet...";
     }
-  });
+  }
 #endif
 }
 }
diff --git a/src/operator/numpy/np_true_divide.cc b/src/operator/numpy/np_true_divide.cc
index 83493041dea9..d2135befef42 100644
--- a/src/operator/numpy/np_true_divide.cc
+++ b/src/operator/numpy/np_true_divide.cc
@@ -73,6 +73,12 @@ NNVM_REGISTER_OP(_npi_true_divide)
     [](const NodeAttrs& attrs){
       return std::vector<std::pair<int, int> >{{0, 0}, {1, 0}};
     })
+#ifdef _WIN32
+.set_attr<FResourceRequest>("FResourceRequest",
+  [](const NodeAttrs& attrs) {
+    return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+  })
+#endif
 .set_attr<FCompute>("FCompute", TrueDivideBroadcastCompute<cpu, mshadow_op::true_divide>)
 .set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseIn{"_backward_broadcast_div"})
 .add_argument("lhs", "NDArray-or-Symbol", "Dividend array")
@@ -90,6 +96,12 @@ NNVM_REGISTER_OP(_npi_true_divide_scalar)
     [](const NodeAttrs& attrs) {
       return std::vector<std::pair<int, int> >{{0, 0}};
     })
+#ifdef _WIN32
+.set_attr<FResourceRequest>("FResourceRequest",
+  [](const NodeAttrs& attrs) {
+    return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+  })
+#endif
 .set_attr<FCompute>("FCompute", TrueDivideScalarCompute<cpu, mshadow_op::true_divide>)
 .set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseNone{"_backward_div_scalar"})
 .add_argument("data", "NDArray-or-Symbol", "source input")
@@ -107,6 +119,12 @@ NNVM_REGISTER_OP(_npi_rtrue_divide_scalar)
     [](const NodeAttrs& attrs) {
       return std::vector<std::pair<int, int> >{{0, 0}};
     })
+#ifdef _WIN32
+.set_attr<FResourceRequest>("FResourceRequest",
+  [](const NodeAttrs& attrs) {
+    return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+  })
+#endif
 .set_attr<FCompute>("FCompute", TrueDivideScalarCompute<cpu, mshadow_op::rtrue_divide>)
 .set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseNone{"_backward_rdiv_scalar"})
 .add_argument("data", "NDArray-or-Symbol", "source input")
diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py
index fd86545014ee..5927f1cbffc7 100644
--- a/tests/python/unittest/test_numpy_op.py
+++ b/tests/python/unittest/test_numpy_op.py
@@ -1684,9 +1684,6 @@ def hybrid_forward(self, F, a, b, *args, **kwargs):
             assert_almost_equal(mx_out.asnumpy(), np_out.astype(mx_out.dtype), rtol=1e-3, atol=1e-5,
                                 use_broadcast=False, equal_nan=True)
 
-    if sys.platform.startswith('win'):
-        return
-
     funcs = {
         'multiply': (-1.0, 1.0),
     }
@@ -1998,7 +1995,7 @@ def get_new_shape(shape, axis):
 
         with mx.autograd.record():
             y = test_concat(a, b, c, d)
-            
+
         assert y.shape == expected_ret.shape
         assert_almost_equal(y.asnumpy(), expected_ret, rtol=1e-3, atol=1e-5)
 
@@ -2991,7 +2988,7 @@ def check_cholesky(L, data_np):
         test_cholesky = TestCholesky()
         if hybridize:
             test_cholesky.hybridize()
-        
+
         # Numerical issue:
         # When backpropagating through Cholesky decomposition, we need to compute the inverse
         # of L according to dA = 0.5 * L**(-T) * copyLTU(L**T * dL) * L**(-1) where A = LL^T.
@@ -3928,20 +3925,19 @@ def test_np_true_divide():
                 out_np = _np.true_divide(val, a.asnumpy())
                 assert_almost_equal(out_mx.asnumpy(), out_np, rtol=1e-3, atol=1e-3, use_broadcast=False)
 
-    if not sys.platform.startswith('win'):
-        for shape_pair, itype, ftype in itertools.product(shapes, itypes, ftypes):
-            i_ = np.random.uniform(3, 50, size=shape_pair[0]).astype(itype)
-            f_ = np.random.uniform(3, 50, size=shape_pair[-1]).astype(ftype)
+    for shape_pair, itype, ftype in itertools.product(shapes, itypes, ftypes):
+        i_ = np.random.uniform(3, 50, size=shape_pair[0]).astype(itype)
+        f_ = np.random.uniform(3, 50, size=shape_pair[-1]).astype(ftype)
 
-            out_mx = i_ / f_
-            assert out_mx.dtype == ftype
-            out_np = _np.true_divide(i_.asnumpy(), f_.asnumpy())
-            assert_almost_equal(out_mx.asnumpy(), out_np, rtol=1e-3, atol=1e-3, use_broadcast=False)
+        out_mx = i_ / f_
+        assert out_mx.dtype == ftype
+        out_np = _np.true_divide(i_.asnumpy(), f_.asnumpy())
+        assert_almost_equal(out_mx.asnumpy(), out_np, rtol=1e-3, atol=1e-3, use_broadcast=False)
 
-            out_mx = f_ / i_
-            assert out_mx.dtype == ftype
-            out_np = _np.true_divide(f_.asnumpy(), i_.asnumpy())
-            assert_almost_equal(out_mx.asnumpy(), out_np, rtol=1e-3, atol=1e-3, use_broadcast=False)
+        out_mx = f_ / i_
+        assert out_mx.dtype == ftype
+        out_np = _np.true_divide(f_.asnumpy(), i_.asnumpy())
+        assert_almost_equal(out_mx.asnumpy(), out_np, rtol=1e-3, atol=1e-3, use_broadcast=False)
 
 
 @with_seed()
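
A minimal usage sketch of the behavior this patch keeps working on Windows builds, modelled on the tests touched above in test_numpy_op.py (the imports mirror that file: mxnet.numpy as np, numpy as _np; the concrete shapes and dtypes are illustrative only, not part of the patch):

    import numpy as _np
    import mxnet.numpy as np

    # Integer / float true_divide: the output takes the dtype of the float operand,
    # which is what test_np_true_divide now also checks on Windows.
    i_ = np.array([10, 20, 30], dtype='int32')
    f_ = np.array([4.0, 5.0, 6.0], dtype='float32')
    out = i_ / f_
    assert out.dtype == f_.dtype
    ref = _np.true_divide(i_.asnumpy(), f_.asnumpy())

    # Bool-with-float multiply goes through the mixed-precision broadcast path
    # (on Windows via the temp-space cast registered above).
    b_ = np.array([True, False, True])
    print((b_ * f_).asnumpy(), ref)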