From 7908d7eb56fc9d20c12afffd8ea592b959b80bfc Mon Sep 17 00:00:00 2001
From: Yiyan66 <57363390+Yiyan66@users.noreply.github.com>
Date: Tue, 28 Jul 2020 15:11:19 +0800
Subject: [PATCH] [numpy] fix flaky mixed precision binary error (#18660)

* temp

* change test

* fix bad func call

* test

* rectify

* doc

* change test
---
 .../numpy/np_elemwise_broadcast_logic_op.cc   | 42 +++++++++++++++++--
 tests/python/unittest/test_numpy_op.py        | 22 +++++++++-
 2 files changed, 60 insertions(+), 4 deletions(-)

diff --git a/src/operator/numpy/np_elemwise_broadcast_logic_op.cc b/src/operator/numpy/np_elemwise_broadcast_logic_op.cc
index b191553f16da..9aacbc02b061 100644
--- a/src/operator/numpy/np_elemwise_broadcast_logic_op.cc
+++ b/src/operator/numpy/np_elemwise_broadcast_logic_op.cc
@@ -79,7 +79,9 @@ TBlob PrependAxes(const TBlob& src, const int dst_ndim) {
   return src.reshape(dst_shape);
 }
 
-struct TVMBinaryBroadcastCompute {
+
+template <typename xpu, typename OP>
+struct GetBinaryBroadcastCompute {
   const char* func;
   void operator()(const nnvm::NodeAttrs& attrs,
                   const mxnet::OpContext& ctx,
@@ -96,6 +98,38 @@ struct TVMBinaryBroadcastCompute {
     std::vector<int> type_codes;
     std::vector<TVMValue> values;
 
+    const TBlob& a = inputs[0];
+    const TBlob& b = inputs[1];
+    if (a.type_flag_ != b.type_flag_) {
+      if (outputs[0].shape_.Size() == 0U) return;
+      mxnet::TShape new_lshape, new_rshape, new_oshape;
+      const TBlob& lhs = inputs[0];
+      const TBlob& rhs = inputs[1];
+      const TBlob& out = outputs[0];
+      int ndim = BinaryBroadcastShapeCompact(lhs.shape_, rhs.shape_, out.shape_,
+                                             &new_lshape, &new_rshape, &new_oshape);
+      if (!ndim) {
+        ElemwiseBinaryOp::ComputeLogic<xpu, OP>(attrs, ctx, inputs, req, outputs);
+      } else {
+        if (req[0] == kNullOp) return;
+        mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
+        MSHADOW_TYPE_SWITCH_WITH_BOOL(lhs.type_flag_, DType, {
+          MSHADOW_TYPE_SWITCH_WITH_BOOL(rhs.type_flag_, EType, {
+            BROADCAST_NDIM_SWITCH(ndim, NDim, {
+              mshadow::Shape<NDim> oshape = new_oshape.get<NDim>();
+              mshadow::Shape<NDim> lstride = mxnet_op::calc_stride(new_lshape.get<NDim>());
+              mshadow::Shape<NDim> rstride = mxnet_op::calc_stride(new_rshape.get<NDim>());
+              mxnet_op::Kernel<mxnet_op::binary_broadcast_kernel<NDim, OP>, xpu>::
+              template LaunchEx(s, new_oshape.Size(), req[0], lstride, rstride, oshape,
+                                lhs.dptr<DType>(), rhs.dptr<EType>(),
+                                out.dptr<bool>());
+            });
+          });
+        });
+      }
+      return;
+    }
+
     const int ondim = outputs[0].shape_.ndim();
     const size_t num_args = inputs.size() + outputs.size();
     type_codes.resize(num_args);
@@ -146,13 +180,15 @@ MXNET_OPERATOR_REGISTER_NP_BINARY_LOGIC(logical_xor);
 
 #define MXNET_OPERATOR_REGISTER_NP_BINARY_LOGIC_CPU(name)                   \
   NNVM_REGISTER_OP(_npi_##name)                                             \
-  .set_attr<FCompute>("FCompute<cpu>", TVMBinaryBroadcastCompute{func_##name##_cpu})
+  .set_attr<FCompute>("FCompute<cpu>",                                      \
+                      GetBinaryBroadcastCompute<cpu, mshadow_op::np_##name>{func_##name##_cpu})
 
 #if MXNET_USE_CUDA
 
 #define MXNET_OPERATOR_REGISTER_NP_BINARY_LOGIC_GPU(name)                   \
   NNVM_REGISTER_OP(_npi_##name)                                             \
-  .set_attr<FCompute>("FCompute<gpu>", TVMBinaryBroadcastCompute{func_##name##_gpu})
+  .set_attr<FCompute>("FCompute<gpu>",                                      \
+                      GetBinaryBroadcastCompute<gpu, mshadow_op::np_##name>{func_##name##_gpu})
 
 MXNET_OPERATOR_REGISTER_NP_BINARY_LOGIC_GPU(equal);
 MXNET_OPERATOR_REGISTER_NP_BINARY_LOGIC_GPU(not_equal);
diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py
index 76181e148950..e4564e92510e 100644
--- a/tests/python/unittest/test_numpy_op.py
+++ b/tests/python/unittest/test_numpy_op.py
@@ -3071,7 +3071,6 @@ def hybrid_forward(self, F, a, b, *args, **kwargs):
 
 @with_seed()
 @use_np
-@pytest.mark.skip(reason='https://github.com/apache/incubator-mxnet/issues/16848')
 def test_np_mixed_precision_binary_funcs():
     itypes = [np.bool, np.int8, np.int32, np.int64]
     ftypes = [np.float16, np.float32, np.float64]
@@ -3084,6 +3083,27 @@ def __init__(self, func):
             def hybrid_forward(self, F, a, b, *args, **kwargs):
                 return getattr(F.np, self._func)(a, b)
 
+        if (func in ['multiply', 'mod', 'equal', 'not_equal', 'greater',
+                     'greater_equal', 'less', 'less_equal']) and \
+                (lshape == () or rshape == ()):
+            # np (MXNet) and onp (official NumPy) infer different result types when an input has shape '()'
+            # for example,
+            # mx_test_x1 = np.random.uniform(-2, 2, (2,3)).astype(np.float32)
+            # mx_test_x2 = np.random.uniform(-2, 2, ()).astype(np.float16)
+            # np_out = _np.mod(mx_test_x1.asnumpy(), mx_test_x2.asnumpy())  # float16
+            # mx_out = np.mod(mx_test_x1, mx_test_x2)  # float32
+
+            # logical ops: NumPy is also inconsistent when the two numbers differ only in precision
+            # for example,
+            # a = np.array([[1.441]], dtype=np.float16)
+            # b = np.array(1.4413278, dtype=np.float32)
+            # c = np.array([1.4413278], dtype=np.float32)
+            # np.greater(a, b), np.greater(a, c)  # True True
+            # _np.greater(a.asnumpy(), b.asnumpy()), _np.greater(a.asnumpy(), c.asnumpy())  # False True
+
+            # thus, skip the tests
+            return
+
         np_func = getattr(_np, func)
         mx_func = TestMixedBinary(func)
         np_test_x1 = _np.random.uniform(low, high, lshape).astype(ltype)
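
A standalone sketch of the dtype behaviour described in the new test comments, kept outside
the patch for reference. It assumes an MXNet build with the numpy interface (mxnet >= 1.6)
and mirrors the naming used in test_numpy_op.py (np is MXNet's numpy module, _np is official
NumPy); it only prints the observed dtypes and comparison results rather than asserting them.

import numpy as _np        # official NumPy (onp)
from mxnet import np, npx  # MXNet's numpy-compatible module
npx.set_np()

# mixed float32/float16 with a 0-dim second operand: compare the inferred result dtypes
mx_test_x1 = np.random.uniform(-2, 2, (2, 3)).astype(np.float32)
mx_test_x2 = np.random.uniform(-2, 2, ()).astype(np.float16)
print(_np.mod(mx_test_x1.asnumpy(), mx_test_x2.asnumpy()).dtype)  # onp result dtype
print(np.mod(mx_test_x1, mx_test_x2).dtype)                       # mx.np result dtype

# comparisons of values that differ only in precision
a = np.array([[1.441]], dtype=np.float16)
b = np.array(1.4413278, dtype=np.float32)
c = np.array([1.4413278], dtype=np.float32)
print(np.greater(a, b), np.greater(a, c))        # mx.np results
print(_np.greater(a.asnumpy(), b.asnumpy()),
      _np.greater(a.asnumpy(), c.asnumpy()))     # onp results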