From 1245499bbff250a55390444bd801e81b8c8612d8 Mon Sep 17 00:00:00 2001 From: wkcn Date: Sun, 22 Sep 2019 10:40:46 +0800 Subject: [PATCH 01/18] fix meansum nan --- 3rdparty/mshadow/mshadow/base.h | 92 ++++++++++++++++++++++---- src/operator/mshadow_op.h | 46 ++++--------- tests/python/unittest/test_operator.py | 23 +++---- 3 files changed, 102 insertions(+), 59 deletions(-) diff --git a/3rdparty/mshadow/mshadow/base.h b/3rdparty/mshadow/mshadow/base.h index e0e9602c00db..f229dec1a38d 100755 --- a/3rdparty/mshadow/mshadow/base.h +++ b/3rdparty/mshadow/mshadow/base.h @@ -606,6 +606,64 @@ struct divto { typedef op::div OPType; }; } // namespace sv + +#ifndef __CUDA_ARCH__ +using std::isnan; +using std::isinf; +#endif + +/*! \brief + * determines if the given floating point + * number is not a number */ +namespace isnan_typed { + template + MSHADOW_XINLINE bool IsNan(volatile DType val) { + return false; + } + template<> + MSHADOW_XINLINE bool IsNan(volatile float val) { + return isnan(val); + } + template<> + MSHADOW_XINLINE bool IsNan(volatile double val) { + return isnan(val); + } + template<> + MSHADOW_XINLINE bool IsNan(volatile long double val) { + return isnan(val); + } + template<> + MSHADOW_XINLINE bool IsNan(volatile mshadow::half::half_t val) { + return (val.half_ & 0x7fff) > 0x7c00; + } +} // namespace isnan_typed + +/*! \brief + * determines if the given floating point + * number is a positive or negative infinity */ +namespace isinf_typed { + template + MSHADOW_XINLINE bool IsInf(volatile DType val) { + return false; + } + template<> + MSHADOW_XINLINE bool IsInf(volatile float val) { + return isinf(val); + } + template<> + MSHADOW_XINLINE bool IsInf(volatile double val) { + return isinf(val); + } + template<> + MSHADOW_XINLINE bool IsInf(volatile long double val) { + return isinf(val); + } + template<> + MSHADOW_XINLINE bool IsInf(volatile mshadow::half::half_t val) { + return (val.half_ & 0x7fff) == 0x7c00; + } +} // namespace isinf_typed + /*! \brief namespace for potential reducer operations */ namespace red { namespace limits { @@ -669,6 +727,11 @@ template<> MSHADOW_XINLINE double NegInfValue(void) { return -HUGE_VAL; } +/*! \brief negative infinity value of float16 */ +template<> +MSHADOW_XINLINE half::half_t NegInfValue(void) { + return half::half_t::Binary(0xfc00); +} /*! * \brief maximum value of certain types @@ -730,6 +793,11 @@ template<> MSHADOW_XINLINE double PosInfValue(void) { return HUGE_VAL; } +/*! \brief positive infinity value of float16 */ +template<> +MSHADOW_XINLINE half::half_t PosInfValue(void) { + return half::half_t::Binary(0x7c00); +} } // namespace limits @@ -745,7 +813,11 @@ struct sum { MSHADOW_XINLINE static void Reduce(volatile DType& dst, volatile DType src, volatile DType& residual) { // NOLINT(*) DType y = src - residual; DType t = dst + y; - residual = (t - dst) - y; + if (isinf_typed::IsInf(t)) { + residual = 0; + } else { + residual = (t - dst) - y; + } dst = t; } /*! \brief combine the results of two reducers */ @@ -797,12 +869,9 @@ struct maximum { /*! \brief do reduction into dst */ template MSHADOW_XINLINE static void Reduce(volatile DType& dst, volatile DType src) { // NOLINT(*) - using namespace std; -#ifdef __CUDACC__ - dst = ::max(dst, src); -#else - dst = max(dst, src); -#endif // __CUDACC__ + if (!isnan_typed::IsNan(dst)) { + dst = DType(dst > src ? dst : src); + } } /*! \brief do reduction into dst */ template @@ -853,12 +922,9 @@ struct minimum { /*! 
\brief do reduction into dst */ template MSHADOW_XINLINE static void Reduce(volatile DType& dst, volatile DType src) { // NOLINT(*) - using namespace std; -#ifdef __CUDACC__ - dst = ::min(dst, src); -#else - dst = min(dst, src); -#endif // __CUDACC__ + if (!isnan_typed::IsNan(dst)) { + dst = DType(dst < src ? dst : src); + } } /*! \brief do reduction into dst */ template diff --git a/src/operator/mshadow_op.h b/src/operator/mshadow_op.h index e14c8dbb0b78..08d81f6f443f 100644 --- a/src/operator/mshadow_op.h +++ b/src/operator/mshadow_op.h @@ -27,6 +27,7 @@ #define MXNET_OPERATOR_MSHADOW_OP_H_ #include +#include #include "math.h" #include "math_functions-inl.h" #include "special_functions-inl.h" @@ -41,6 +42,8 @@ namespace mxnet { namespace op { namespace mshadow_op { +using mshadow::isnan_typed::IsNan; + #ifdef __CUDA_ARCH__ __constant__ const float PI = 3.14159265358979323846; __constant__ const float SELU_ALPHA = 1.6732632423543772848170429916717; @@ -51,7 +54,6 @@ const float PI = 3.14159265358979323846; const float SELU_ALPHA = 1.6732632423543772848170429916717; const float SELU_LAMBDA = 1.0507009873554804934193349852946; const float SQRT_2 = 1.4142135623730950488016887242096; -using std::isnan; #endif using std::enable_if; using std::is_unsigned; @@ -826,37 +828,13 @@ struct product { } }; -namespace isnan_typed { - template - MSHADOW_XINLINE bool IsNan(volatile DType val) { - return false; - } - template<> - MSHADOW_XINLINE bool IsNan(volatile float val) { - return isnan(val); - } - template<> - MSHADOW_XINLINE bool IsNan(volatile double val) { - return isnan(val); - } - template<> - MSHADOW_XINLINE bool IsNan(volatile long double val) { - return isnan(val); - } - - template<> - MSHADOW_XINLINE bool IsNan(volatile mshadow::half::half_t val) { - return (val.half_ & 0x7fff) > 0x7c00; - } -}; // namespace isnan_typed - -MXNET_UNARY_MATH_OP_NC(relu, isnan_typed::IsNan(a) || (a > DType(0)) ? a : DType(0)); +MXNET_UNARY_MATH_OP_NC(relu, IsNan(a) || (a > DType(0)) ? a : DType(0)); /*! \brief used for computing gradient of relu operator */ struct relu_grad : public mxnet_op::tunable { template MSHADOW_XINLINE static DType Map(DType a) { - if (isnan_typed::IsNan(a)) { + if (IsNan(a)) { return a; } else { return a > DType(0) ? DType(1) : DType(0); @@ -868,7 +846,7 @@ struct relu_grad : public mxnet_op::tunable { struct maximum : public mxnet_op::tunable { template MSHADOW_XINLINE static DType Map(DType a, DType b) { - if (isnan_typed::IsNan(a)) { + if (IsNan(a)) { return a; } else { return (a > b ? a : b); @@ -880,7 +858,7 @@ struct maximum : public mxnet_op::tunable { struct minimum : public mxnet_op::tunable { template MSHADOW_XINLINE static DType Map(DType a, DType b) { - if (isnan_typed::IsNan(a)) { + if (IsNan(a)) { return a; } else { return DType(a < b ? a : b); @@ -893,13 +871,13 @@ struct nansum { /*! \brief do reduction into dst */ template MSHADOW_XINLINE static void Reduce(volatile DType& dst, volatile DType src) { // NOLINT(*) - if (isnan_typed::IsNan(src)) return; + if (IsNan(src)) return; dst += src; } /*! \brief do reduction into dst */ template MSHADOW_XINLINE static void Reduce(volatile DType& dst, volatile DType src, volatile DType& residual) { // NOLINT(*) - if (isnan_typed::IsNan(src)) return; + if (IsNan(src)) return; DType y = src - residual; DType t = dst + y; residual = (t - dst) - y; @@ -945,7 +923,7 @@ struct nansum { struct nansum_grad : public mxnet_op::tunable { template MSHADOW_XINLINE static DType Map(DType a, DType b) { - return isnan_typed::IsNan(a) ? 
DType(0) : DType(1); + return IsNan(a) ? DType(0) : DType(1); } }; @@ -954,7 +932,7 @@ struct nanprod { /*! \brief do reduction into dst */ template MSHADOW_XINLINE static void Reduce(volatile DType& dst, volatile DType src) { // NOLINT(*) - if (isnan_typed::IsNan(src)) return; + if (IsNan(src)) return; dst *= src; } /*! \brief do reduction into dst */ @@ -1128,7 +1106,7 @@ struct sum { struct nanprod_grad : public mxnet_op::tunable { template MSHADOW_XINLINE static DType Map(DType a, DType b) { - return isnan_typed::IsNan(a) ? DType(0) : b / a; + return IsNan(a) ? DType(0) : b / a; } }; diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index 66bd9ec6b489..136623630a72 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -9244,21 +9244,20 @@ def test_sample_normal_default_shape(): assert s.shape == (1, 1) -def test_min_max_inf(): - dtypes = [np.float32, np.double] - elem_list = [-1, 1, 0, np.inf, -np.inf] - +def test_inf_and_nan(): + dtypes = [np.float16, np.float32, np.double] + elem_list = [-1, 1, 0, np.inf, -np.inf, np.nan] + op_names = ['min', 'max', 'mean', 'sum'] for dtype in dtypes: for a in elem_list: for b in elem_list: - data_np = np.array([a, b], dtype=dtype) - data_mx = mx.nd.array(data_np, dtype=dtype) - - min_data_np, max_data_np = data_np.min(), data_np.max() - min_data_mx, max_data_mx = data_mx.min(), data_mx.max() - - assert_array_equal(min_data_np, min_data_mx.asnumpy()) - assert_array_equal(max_data_np, max_data_mx.asnumpy()) + for op_name in op_names: + print(dtype, a, b, op_name) + data_np = np.array([a, b], dtype=dtype) + data_mx = mx.nd.array(data_np, dtype=dtype) + out_data_np = getattr(data_np, op_name)() + out_data_mx = getattr(data_mx, op_name)() + assert_array_equal(out_data_np, out_data_mx.asnumpy()) if __name__ == '__main__': From 100d9c1c8e346ae001d91ba05a5ede703ae885e8 Mon Sep 17 00:00:00 2001 From: wkcn Date: Sun, 22 Sep 2019 10:48:46 +0800 Subject: [PATCH 02/18] remove print in testcase --- tests/python/unittest/test_operator.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index 136623630a72..68e3d4bb0e98 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -9252,7 +9252,6 @@ def test_inf_and_nan(): for a in elem_list: for b in elem_list: for op_name in op_names: - print(dtype, a, b, op_name) data_np = np.array([a, b], dtype=dtype) data_mx = mx.nd.array(data_np, dtype=dtype) out_data_np = getattr(data_np, op_name)() From 4aa2dc6cc45af0fbfdc79e207dd5e5c0c84eab6c Mon Sep 17 00:00:00 2001 From: wkcn Date: Sun, 22 Sep 2019 10:53:51 +0800 Subject: [PATCH 03/18] update to avoid assignment --- 3rdparty/mshadow/mshadow/base.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/3rdparty/mshadow/mshadow/base.h b/3rdparty/mshadow/mshadow/base.h index f229dec1a38d..15d0fdde0d80 100755 --- a/3rdparty/mshadow/mshadow/base.h +++ b/3rdparty/mshadow/mshadow/base.h @@ -870,7 +870,7 @@ struct maximum { template MSHADOW_XINLINE static void Reduce(volatile DType& dst, volatile DType src) { // NOLINT(*) if (!isnan_typed::IsNan(dst)) { - dst = DType(dst > src ? dst : src); + if (!(dst > src)) dst = src; } } /*! \brief do reduction into dst */ @@ -923,7 +923,7 @@ struct minimum { template MSHADOW_XINLINE static void Reduce(volatile DType& dst, volatile DType src) { // NOLINT(*) if (!isnan_typed::IsNan(dst)) { - dst = DType(dst < src ? 
dst : src); + if (!(dst < src)) dst = src; } } /*! \brief do reduction into dst */ From 1d557e09056b12f88b38ab08a511f71b8152ac52 Mon Sep 17 00:00:00 2001 From: wkcn Date: Sun, 22 Sep 2019 10:59:50 +0800 Subject: [PATCH 04/18] update --- 3rdparty/mshadow/mshadow/base.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/3rdparty/mshadow/mshadow/base.h b/3rdparty/mshadow/mshadow/base.h index 15d0fdde0d80..fa497d3c9cc3 100755 --- a/3rdparty/mshadow/mshadow/base.h +++ b/3rdparty/mshadow/mshadow/base.h @@ -870,7 +870,7 @@ struct maximum { template MSHADOW_XINLINE static void Reduce(volatile DType& dst, volatile DType src) { // NOLINT(*) if (!isnan_typed::IsNan(dst)) { - if (!(dst > src)) dst = src; + if (!(dst >= src)) dst = src; } } /*! \brief do reduction into dst */ @@ -923,7 +923,7 @@ struct minimum { template MSHADOW_XINLINE static void Reduce(volatile DType& dst, volatile DType src) { // NOLINT(*) if (!isnan_typed::IsNan(dst)) { - if (!(dst < src)) dst = src; + if (!(dst <= src)) dst = src; } } /*! \brief do reduction into dst */ From 1d794479be3e657c10ba99b8b4214fbfe71750ab Mon Sep 17 00:00:00 2001 From: wkcn Date: Sun, 22 Sep 2019 17:10:26 +0800 Subject: [PATCH 05/18] fix argmin and argmax, update julia unittest --- 3rdparty/mshadow/mshadow/extension/reduce_with_axis.h | 2 +- julia/test/unittest/ndarray.jl | 8 ++++---- tests/python/unittest/test_operator.py | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/3rdparty/mshadow/mshadow/extension/reduce_with_axis.h b/3rdparty/mshadow/mshadow/extension/reduce_with_axis.h index 54bcc750cfc5..26b6156ad6f9 100644 --- a/3rdparty/mshadow/mshadow/extension/reduce_with_axis.h +++ b/3rdparty/mshadow/mshadow/extension/reduce_with_axis.h @@ -112,7 +112,7 @@ struct Plan, DTy index_t z = (x*size_+k)*trailing_+y; DType tmp = res; Reducer::Reduce(res, src_.Eval(z/last_, z%last_)); - if (tmp != res) { + if (tmp != res && !isnan_typed::IsNan(tmp)) { idx = k; } } diff --git a/julia/test/unittest/ndarray.jl b/julia/test/unittest/ndarray.jl index 638963f1b8aa..ac91e8d0f76e 100644 --- a/julia/test/unittest/ndarray.jl +++ b/julia/test/unittest/ndarray.jl @@ -1525,8 +1525,8 @@ function test_argmax() NaN 2 6] x = NDArray(A) - @test copy(argmax(x, dims = 1)) == [1 1 2] - @test copy(argmax(x, dims = 2)) == reshape([2, 3], :, 1) + @test copy(argmax(x, dims = 1)) == [2 1 2] + @test copy(argmax(x, dims = 2)) == reshape([2, 1], :, 1) end end @@ -1547,8 +1547,8 @@ function test_argmin() NaN 2 6] x = NDArray(A) - @test copy(argmin(x, dims = 1)) == [1 2 1] - @test copy(argmin(x, dims = 2)) == reshape([1, 2], :, 1) + @test copy(argmin(x, dims = 1)) == [2 2 1] + @test copy(argmin(x, dims = 2)) == reshape([1, 1], :, 1) end end diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index 68e3d4bb0e98..ee2598cf5c84 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -9247,7 +9247,7 @@ def test_sample_normal_default_shape(): def test_inf_and_nan(): dtypes = [np.float16, np.float32, np.double] elem_list = [-1, 1, 0, np.inf, -np.inf, np.nan] - op_names = ['min', 'max', 'mean', 'sum'] + op_names = ['min', 'max', 'mean', 'sum', 'argmin', 'argmax'] for dtype in dtypes: for a in elem_list: for b in elem_list: From 9b744d5a5a10c007956069f7c294e30c92725662 Mon Sep 17 00:00:00 2001 From: wkcn Date: Sun, 22 Sep 2019 18:39:38 +0800 Subject: [PATCH 06/18] update argmin/argmax docs in julia bindings --- julia/src/ndarray/reduction.jl | 6 ++---- 1 file changed, 2 
insertions(+), 4 deletions(-) diff --git a/julia/src/ndarray/reduction.jl b/julia/src/ndarray/reduction.jl index 833b483ca321..2045ce231674 100644 --- a/julia/src/ndarray/reduction.jl +++ b/julia/src/ndarray/reduction.jl @@ -47,8 +47,7 @@ broadcasted(::typeof(min), x::NDArray{T}, y::NDArray{T}) where {T} = """ argmax(x::NDArray; dims) -> indices -Note that `NaN` is skipped during comparison. -This is different from Julia `Base.argmax`. +Note that `NaN` is treated as greater than all other values in `argmax`. ## Examples @@ -77,8 +76,7 @@ Base.argmax(x::NDArray; dims = :) = _argmax(x, dims) .+ 1 """ argmin(x::NDArray; dims) -> indices -Note that `NaN` is skipped during comparison. -This is different from Julia `Base.argmin`. +Note that `NaN` is treated as less than all other values in `argmin`. ## Examples From 66280f1870ac4f1bbfa9f9f9781aa22fe29bb46d Mon Sep 17 00:00:00 2001 From: wkcn Date: Sun, 22 Sep 2019 21:56:39 +0800 Subject: [PATCH 07/18] debug --- tests/python/unittest/test_operator.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index ee2598cf5c84..de843ec173b4 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -9252,6 +9252,7 @@ def test_inf_and_nan(): for a in elem_list: for b in elem_list: for op_name in op_names: + print(dtype, a, b, op_name) data_np = np.array([a, b], dtype=dtype) data_mx = mx.nd.array(data_np, dtype=dtype) out_data_np = getattr(data_np, op_name)() From 848c57b84618c7af14c982bc80ad28a7d975eb3c Mon Sep 17 00:00:00 2001 From: wkcn Date: Sun, 22 Sep 2019 23:26:19 +0800 Subject: [PATCH 08/18] update --- tests/python/unittest/test_operator.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index de843ec173b4..d75535042b94 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -9248,16 +9248,20 @@ def test_inf_and_nan(): dtypes = [np.float16, np.float32, np.double] elem_list = [-1, 1, 0, np.inf, -np.inf, np.nan] op_names = ['min', 'max', 'mean', 'sum', 'argmin', 'argmax'] + record = [] for dtype in dtypes: for a in elem_list: for b in elem_list: for op_name in op_names: - print(dtype, a, b, op_name) data_np = np.array([a, b], dtype=dtype) data_mx = mx.nd.array(data_np, dtype=dtype) out_data_np = getattr(data_np, op_name)() out_data_mx = getattr(data_mx, op_name)() - assert_array_equal(out_data_np, out_data_mx.asnumpy()) + try: + assert_array_equal(out_data_np, out_data_mx.asnumpy()) + except AssertionError: + record.append((dtype, a, b, op_name)) + assert len(record) == 0, record if __name__ == '__main__': From 4de6bacf27ae2ec6f54aef245b3b08e33ae3272a Mon Sep 17 00:00:00 2001 From: wkcn Date: Mon, 23 Sep 2019 00:03:27 +0800 Subject: [PATCH 09/18] update test --- tests/python/unittest/test_operator.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index d75535042b94..c80c56d0dd88 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -9259,8 +9259,10 @@ def test_inf_and_nan(): out_data_mx = getattr(data_mx, op_name)() try: assert_array_equal(out_data_np, out_data_mx.asnumpy()) - except AssertionError: - record.append((dtype, a, b, op_name)) + except AssertionError as e: + args = (dtype, a, b, op_name) + print(args, e, '\n---------\n') + 
record.append(args) assert len(record) == 0, record From 9c3a72cf608ace147127e09d0273c3cae0de5a15 Mon Sep 17 00:00:00 2001 From: wkcn Date: Mon, 23 Sep 2019 00:25:37 +0800 Subject: [PATCH 10/18] fix sum merge --- 3rdparty/mshadow/mshadow/base.h | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/3rdparty/mshadow/mshadow/base.h b/3rdparty/mshadow/mshadow/base.h index fa497d3c9cc3..a55c3dae26da 100755 --- a/3rdparty/mshadow/mshadow/base.h +++ b/3rdparty/mshadow/mshadow/base.h @@ -829,10 +829,15 @@ struct sum { template MSHADOW_XINLINE static void Merge(volatile DType& dst_val, volatile DType& dst_residual, volatile DType& src_val, volatile DType& src_residual) { // NOLINT(*) DType t1 = dst_val + src_val; - DType e = t1 - dst_val; - DType t2 = ((src_val - e) + (dst_val - (t1 - e))) + dst_residual + src_residual; - dst_val = t1 + t2; - dst_residual = t2 - (dst_val - t1); + if (isinf_typed::IsInf(t1)) { + dst_val = t1; + dst_residual = 0; + } else { + DType e = t1 - dst_val; + DType t2 = ((src_val - e) + (dst_val - (t1 - e))) + dst_residual + src_residual; + dst_val = t1 + t2; + dst_residual = t2 - (dst_val - t1); + } } /*! \brief finalize reduction */ template From 9f2e4fc7f2a37702036706c0f1ac0acac897979e Mon Sep 17 00:00:00 2001 From: wkcn Date: Tue, 24 Sep 2019 11:10:43 +0800 Subject: [PATCH 11/18] update testcase --- julia/test/unittest/ndarray.jl | 16 ++++---- python/mxnet/ndarray/ndarray.py | 1 + tests/python/unittest/test_ndarray.py | 54 +++++++++++++++++++++----- tests/python/unittest/test_operator.py | 22 ----------- 4 files changed, 53 insertions(+), 40 deletions(-) diff --git a/julia/test/unittest/ndarray.jl b/julia/test/unittest/ndarray.jl index ac91e8d0f76e..5d18ac8ac4b3 100644 --- a/julia/test/unittest/ndarray.jl +++ b/julia/test/unittest/ndarray.jl @@ -1515,8 +1515,8 @@ function test_argmax() 4 2 6] x = NDArray(A) - @test copy(argmax(x, dims = 1)) == [2 1 2] - @test copy(argmax(x, dims = 2)) == reshape([2, 3], :, 1) + @test copy(argmax(x, dims = 1)) == [x[1] for x in argmax(A, dims = 1)] + @test copy(argmax(x, dims = 2)) == [x[2] for x in argmax(A, dims = 2)] end @info "NDArray::argmax::NaN" @@ -1525,8 +1525,8 @@ function test_argmax() NaN 2 6] x = NDArray(A) - @test copy(argmax(x, dims = 1)) == [2 1 2] - @test copy(argmax(x, dims = 2)) == reshape([2, 1], :, 1) + @test copy(argmax(x, dims = 1)) == [x[1] for x in argmax(A, dims = 1)] + @test copy(argmax(x, dims = 2)) == [x[2] for x in argmax(A, dims = 2)] end end @@ -1537,8 +1537,8 @@ function test_argmin() 4 2 6] x = NDArray(A) - @test copy(argmin(x, dims = 1)) == [1 2 1] - @test copy(argmin(x, dims = 2)) == reshape([1, 2], :, 1) + @test copy(argmin(x, dims = 1)) == [x[1] for x in argmin(A, dims = 1)] + @test copy(argmin(x, dims = 2)) == [x[2] for x in argmin(A, dims = 2)] end @info "NDArray::argmin::NaN" @@ -1547,8 +1547,8 @@ function test_argmin() NaN 2 6] x = NDArray(A) - @test copy(argmin(x, dims = 1)) == [2 2 1] - @test copy(argmin(x, dims = 2)) == reshape([1, 1], :, 1) + @test copy(argmin(x, dims = 1)) == [x[1] for x in argmin(A, dims = 1)] + @test copy(argmin(x, dims = 2)) == [x[2] for x in argmin(A, dims = 2)] end end diff --git a/python/mxnet/ndarray/ndarray.py b/python/mxnet/ndarray/ndarray.py index 4e3c7efa7be3..162687c9bbeb 100644 --- a/python/mxnet/ndarray/ndarray.py +++ b/python/mxnet/ndarray/ndarray.py @@ -4909,6 +4909,7 @@ class DLDataType(ctypes.Structure): "bool": (1, 1, 1), "uint32": (1, 32, 1), "uint64": (1, 64, 1), + 'float16': (2, 16, 1), "float32": (2, 32, 1), "float64": (2, 64, 
1), } diff --git a/tests/python/unittest/test_ndarray.py b/tests/python/unittest/test_ndarray.py index bee4bff0f7c0..d81459a83504 100644 --- a/tests/python/unittest/test_ndarray.py +++ b/tests/python/unittest/test_ndarray.py @@ -21,6 +21,7 @@ from itertools import permutations, combinations_with_replacement import os import pickle as pkl +import random import functools from nose.tools import assert_raises, raises from common import with_seed, assertRaises, TemporaryDirectory @@ -31,7 +32,7 @@ from mxnet.test_utils import same from mxnet.test_utils import random_sample, rand_shape_nd, random_arrays from mxnet import runtime -from numpy.testing import assert_allclose +from numpy.testing import assert_allclose, assert_array_equal, assert_array_almost_equal import mxnet.autograd @@ -578,13 +579,40 @@ def test_dot(): @with_seed() def test_reduce(): - sample_num = 200 - def test_reduce_inner(numpy_reduce_func, nd_reduce_func, multi_axes): + sample_num = 300 + def test_reduce_inner(numpy_reduce_func, nd_reduce_func, multi_axes, + allow_almost_equal=False, check_dtype=True): + dtypes = [(np.float16, 1), + (np.float32, 5), + (np.double, 6)] for i in range(sample_num): + dtype, decimal = random.choice(dtypes) ndim = np.random.randint(1, 6) shape = np.random.randint(1, 11, size=ndim) - dat = np.random.rand(*shape) - 0.5 + dat = (np.random.rand(*shape) - 0.5).astype(dtype) keepdims = np.random.randint(0, 2) + + allow_nan = np.random.randint(0, 2) + if allow_nan: + total_nans = np.random.randint(0, dat.size//10+1) + dat.ravel()[np.random.choice( + dat.size, total_nans, replace=False)] = np.nan + + allow_inf = np.random.randint(0, 2) + if allow_inf: + r = np.random.randint(0, 3) + total_infs = np.random.randint(0, dat.size//20+1) + if r == 0: + total_pos_infs, total_neg_infs = total_infs, 0 + elif r == 1: + total_pos_infs, total_neg_infs = 0, total_infs + else: + total_pos_infs = total_neg_infs = total_infs // 2 + dat.ravel()[np.random.choice( + dat.size, total_pos_infs, replace=False)] = np.inf + dat.ravel()[np.random.choice( + dat.size, total_neg_infs, replace=False)] = -np.inf + if multi_axes: axis_flags = np.random.randint(0, 2, size=ndim) axes = [] @@ -599,16 +627,22 @@ def test_reduce_inner(numpy_reduce_func, nd_reduce_func, multi_axes): axes = np.random.randint(0, ndim) numpy_ret = numpy_reduce_func(dat, axis=axes, keepdims=keepdims) - ndarray_ret = nd_reduce_func(mx.nd.array(dat), axis=axes, keepdims=keepdims) + mx_arr = mx.nd.array(dat, dtype=dtype) + ndarray_ret = nd_reduce_func(mx_arr, axis=axes, keepdims=keepdims) if type(ndarray_ret) is mx.ndarray.NDArray: ndarray_ret = ndarray_ret.asnumpy() assert (ndarray_ret.shape == numpy_ret.shape) or \ (ndarray_ret.shape == (1,) and numpy_ret.shape == ()), "nd:%s, numpy:%s" \ %(ndarray_ret.shape, numpy_ret.shape) - err = np.square(ndarray_ret - numpy_ret).mean() - assert err < 1E-4 + if check_dtype: + assert ndarray_ret.dtype == numpy_ret.dtype,\ + (ndarray_ret.dtype, numpy_ret.dtype) + if allow_almost_equal: + assert_array_almost_equal(ndarray_ret, numpy_ret, decimal=decimal) + else: + assert_array_equal(ndarray_ret, numpy_ret) test_reduce_inner(lambda data, axis, keepdims:np_reduce(data, axis, keepdims, np.sum), - mx.nd.sum, True) + mx.nd.sum, True, allow_almost_equal=True) test_reduce_inner(lambda data, axis, keepdims:np_reduce(data, axis, keepdims, np.max), mx.nd.max, True) test_reduce_inner(lambda data, axis, keepdims:np_reduce(data, axis, keepdims, np.min), @@ -617,10 +651,10 @@ def test_reduce_inner(numpy_reduce_func, nd_reduce_func, 
multi_axes): # Force numpy to match mxnet's float32. test_reduce_inner(lambda data, axis, keepdims:np_reduce(np.float32(data), axis, keepdims, np.argmax), - mx.nd.argmax, False) + mx.nd.argmax, False, check_dtype=False) test_reduce_inner(lambda data, axis, keepdims:np_reduce(np.float32(data), axis, keepdims, np.argmin), - mx.nd.argmin, False) + mx.nd.argmin, False, check_dtype=False) @with_seed() diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index c80c56d0dd88..8a2d566d9c52 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -9244,28 +9244,6 @@ def test_sample_normal_default_shape(): assert s.shape == (1, 1) -def test_inf_and_nan(): - dtypes = [np.float16, np.float32, np.double] - elem_list = [-1, 1, 0, np.inf, -np.inf, np.nan] - op_names = ['min', 'max', 'mean', 'sum', 'argmin', 'argmax'] - record = [] - for dtype in dtypes: - for a in elem_list: - for b in elem_list: - for op_name in op_names: - data_np = np.array([a, b], dtype=dtype) - data_mx = mx.nd.array(data_np, dtype=dtype) - out_data_np = getattr(data_np, op_name)() - out_data_mx = getattr(data_mx, op_name)() - try: - assert_array_equal(out_data_np, out_data_mx.asnumpy()) - except AssertionError as e: - args = (dtype, a, b, op_name) - print(args, e, '\n---------\n') - record.append(args) - assert len(record) == 0, record - - if __name__ == '__main__': import nose nose.runmodule() From 714951c06ceee16eb9414b3338fd5d8bc4be0192 Mon Sep 17 00:00:00 2001 From: wkcn Date: Tue, 24 Sep 2019 13:32:32 +0800 Subject: [PATCH 12/18] update including sign --- julia/test/unittest/ndarray.jl | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/julia/test/unittest/ndarray.jl b/julia/test/unittest/ndarray.jl index 5d18ac8ac4b3..599b0a65bfc4 100644 --- a/julia/test/unittest/ndarray.jl +++ b/julia/test/unittest/ndarray.jl @@ -1515,8 +1515,8 @@ function test_argmax() 4 2 6] x = NDArray(A) - @test copy(argmax(x, dims = 1)) == [x[1] for x in argmax(A, dims = 1)] - @test copy(argmax(x, dims = 2)) == [x[2] for x in argmax(A, dims = 2)] + @test copy(argmax(x, dims = 1)) == [x[1] for x ∈ argmax(A, dims = 1)] + @test copy(argmax(x, dims = 2)) == [x[2] for x ∈ argmax(A, dims = 2)] end @info "NDArray::argmax::NaN" @@ -1525,8 +1525,8 @@ function test_argmax() NaN 2 6] x = NDArray(A) - @test copy(argmax(x, dims = 1)) == [x[1] for x in argmax(A, dims = 1)] - @test copy(argmax(x, dims = 2)) == [x[2] for x in argmax(A, dims = 2)] + @test copy(argmax(x, dims = 1)) == [x[1] for x ∈ argmax(A, dims = 1)] + @test copy(argmax(x, dims = 2)) == [x[2] for x ∈ argmax(A, dims = 2)] end end @@ -1537,8 +1537,8 @@ function test_argmin() 4 2 6] x = NDArray(A) - @test copy(argmin(x, dims = 1)) == [x[1] for x in argmin(A, dims = 1)] - @test copy(argmin(x, dims = 2)) == [x[2] for x in argmin(A, dims = 2)] + @test copy(argmin(x, dims = 1)) == [x[1] for x ∈ argmin(A, dims = 1)] + @test copy(argmin(x, dims = 2)) == [x[2] for x ∈ argmin(A, dims = 2)] end @info "NDArray::argmin::NaN" @@ -1547,8 +1547,8 @@ function test_argmin() NaN 2 6] x = NDArray(A) - @test copy(argmin(x, dims = 1)) == [x[1] for x in argmin(A, dims = 1)] - @test copy(argmin(x, dims = 2)) == [x[2] for x in argmin(A, dims = 2)] + @test copy(argmin(x, dims = 1)) == [x[1] for x ∈ argmin(A, dims = 1)] + @test copy(argmin(x, dims = 2)) == [x[2] for x ∈ argmin(A, dims = 2)] end end From 6666bd6834454786fa9baf6ebd8e4274024c3c19 Mon Sep 17 00:00:00 2001 From: wkcn Date: Thu, 17 Oct 2019 17:14:15 +0800 
Subject: [PATCH 13/18] fix allclose --- src/operator/contrib/allclose_op-inl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/operator/contrib/allclose_op-inl.h b/src/operator/contrib/allclose_op-inl.h index a858450f0007..32a4244484a0 100644 --- a/src/operator/contrib/allclose_op-inl.h +++ b/src/operator/contrib/allclose_op-inl.h @@ -84,7 +84,7 @@ inline bool AllCloseType(const nnvm::NodeAttrs& attrs, return (*out_attrs)[0] != -1; } -using namespace mshadow_op::isnan_typed; +using mshadow::isnan_typed::IsNan; template struct allclose_forward { From dca0c2a02ca78984eede93a4a81283ea6dd5e66b Mon Sep 17 00:00:00 2001 From: JackieWu Date: Fri, 18 Oct 2019 01:14:39 +0800 Subject: [PATCH 14/18] ci From 9183cb3a2284a7d920bb720a19c38dc49d66fdea Mon Sep 17 00:00:00 2001 From: wkcn Date: Tue, 12 Nov 2019 09:39:54 +0800 Subject: [PATCH 15/18] use constants --- 3rdparty/mshadow/mshadow/base.h | 9 +++++---- 3rdparty/mshadow/mshadow/half.h | 2 ++ 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/3rdparty/mshadow/mshadow/base.h b/3rdparty/mshadow/mshadow/base.h index a55c3dae26da..acaeb1bb1890 100755 --- a/3rdparty/mshadow/mshadow/base.h +++ b/3rdparty/mshadow/mshadow/base.h @@ -634,7 +634,7 @@ namespace isnan_typed { } template<> MSHADOW_XINLINE bool IsNan(volatile mshadow::half::half_t val) { - return (val.half_ & 0x7fff) > 0x7c00; + return (val.half_ & (~MSHADOW_HALF_SIGN_BIT)) > MSHADOW_HALF_EXPONENT_BITS; } } // namespace isnan_typed @@ -660,7 +660,7 @@ namespace isinf_typed { } template<> MSHADOW_XINLINE bool IsInf(volatile mshadow::half::half_t val) { - return (val.half_ & 0x7fff) == 0x7c00; + return (val.half_ & (~MSHADOW_HALF_SIGN_BIT)) == MSHADOW_HALF_EXPONENT_BITS; } } // namespace isinf_typed @@ -730,7 +730,8 @@ MSHADOW_XINLINE double NegInfValue(void) { /*! \brief negative infinity value of float16 */ template<> MSHADOW_XINLINE half::half_t NegInfValue(void) { - return half::half_t::Binary(0xfc00); + return half::half_t::Binary( + MSHADOW_HALF_SIGN_BIT | MSHADOW_HALF_EXPONENT_BITS); } /*! @@ -796,7 +797,7 @@ MSHADOW_XINLINE double PosInfValue(void) { /*! 
\brief positive infinity value of float16 */ template<> MSHADOW_XINLINE half::half_t PosInfValue(void) { - return half::half_t::Binary(0x7c00); + return half::half_t::Binary(MSHADOW_HALF_EXPONENT_BITS); } } // namespace limits diff --git a/3rdparty/mshadow/mshadow/half.h b/3rdparty/mshadow/mshadow/half.h index 2dded0a7752e..1cc53ae0460f 100644 --- a/3rdparty/mshadow/mshadow/half.h +++ b/3rdparty/mshadow/mshadow/half.h @@ -349,6 +349,8 @@ MSHADOW_HALF_OPERATOR(bool, <=) #define MSHADOW_HALF_MIN mshadow::half::half_t::Binary(0xFBFF); #define MSHADOW_HALF_MAX mshadow::half::half_t::Binary(0x7BFF); +#define MSHADOW_HALF_SIGN_BIT 0x8000 +#define MSHADOW_HALF_EXPONENT_BITS 0x7c00 } // namespace half } // namespace mshadow #endif // MSHADOW_HALF_H_ From 3b51b794036bea9fbb9ae15eb7b34a93286725fa Mon Sep 17 00:00:00 2001 From: wkcn Date: Tue, 12 Nov 2019 11:08:03 +0800 Subject: [PATCH 16/18] fix build for isinf and isnan --- src/operator/mshadow_op.h | 2 +- src/operator/tensor/elemwise_unary_op.h | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/operator/mshadow_op.h b/src/operator/mshadow_op.h index 4b00af1811d9..4ae587188d1b 100644 --- a/src/operator/mshadow_op.h +++ b/src/operator/mshadow_op.h @@ -43,7 +43,7 @@ namespace op { namespace mshadow_op { using mshadow::isnan_typed::IsNan; -using mshadow::isnan_typed::IsInf; +using mshadow::isinf_typed::IsInf; #ifdef __CUDA_ARCH__ __constant__ const float PI = 3.14159265358979323846; diff --git a/src/operator/tensor/elemwise_unary_op.h b/src/operator/tensor/elemwise_unary_op.h index 27013dfb98ae..577c994a8ee1 100644 --- a/src/operator/tensor/elemwise_unary_op.h +++ b/src/operator/tensor/elemwise_unary_op.h @@ -699,9 +699,9 @@ struct nan_to_num_forward { const DType posinf, const DType neginf) { DType val = in_data[i]; - if (mshadow_op::isnan_typed::IsNan(val)) val = nan; - if (val > 0 && mshadow_op::isinf_typed::IsInf(val)) val = posinf; - if (val < 0 && mshadow_op::isinf_typed::IsInf(val)) val = neginf; + if (mshadow_op::IsNan(val)) val = nan; + if (val > 0 && mshadow_op::IsInf(val)) val = posinf; + if (val < 0 && mshadow_op::IsInf(val)) val = neginf; KERNEL_ASSIGN(out_data[i], req, val); } }; @@ -758,9 +758,9 @@ struct nan_to_num_backward { const DType* out_grad, const DType* in_data) { DType val = out_grad[i]; - if (mshadow_op::isnan_typed::IsNan(in_data[i])) val = 0; - if (val > 0 && mshadow_op::isinf_typed::IsInf(in_data[i])) val = 0; - if (val < 0 && mshadow_op::isinf_typed::IsInf(in_data[i])) val = 0; + if (mshadow_op::IsNan(in_data[i])) val = 0; + if (val > 0 && mshadow_op::IsInf(in_data[i])) val = 0; + if (val < 0 && mshadow_op::IsInf(in_data[i])) val = 0; KERNEL_ASSIGN(in_grad[i], req, val); } }; From c8761ada298ed823cd59b67e9683b87bdc8ccba7 Mon Sep 17 00:00:00 2001 From: JackieWu Date: Tue, 12 Nov 2019 14:53:26 +0800 Subject: [PATCH 17/18] ci From 3a2f062c5e5809b9abf24b9331fd597d09f02490 Mon Sep 17 00:00:00 2001 From: JackieWu Date: Tue, 12 Nov 2019 19:05:13 +0800 Subject: [PATCH 18/18] ci
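
---

A few standalone notes on the techniques in this series follow. None of this code is part of the patches; the helper names (IsNanHalf, IsInfHalf, guarded_kahan_sum, nan_propagating_max, nan_aware_argmax) are illustrative only.

The float16 checks in base.h and the new MSHADOW_HALF_SIGN_BIT / MSHADOW_HALF_EXPONENT_BITS constants rely on the IEEE 754 binary16 layout: 1 sign bit, 5 exponent bits (mask 0x7C00), 10 mantissa bits. A value is Inf when the exponent bits are all set and the mantissa is zero, and NaN when the exponent bits are all set and the mantissa is non-zero. A minimal sketch using plain uint16_t in place of mshadow::half::half_t:

#include <cassert>
#include <cstdint>

// IEEE 754 binary16: s eeeee mmmmmmmmmm (1 sign, 5 exponent, 10 mantissa bits).
constexpr uint16_t kSignBit      = 0x8000;  // MSHADOW_HALF_SIGN_BIT
constexpr uint16_t kExponentBits = 0x7C00;  // MSHADOW_HALF_EXPONENT_BITS

// NaN: exponent all ones and mantissa non-zero, so the unsigned magnitude
// is strictly greater than the bare exponent mask.
inline bool IsNanHalf(uint16_t bits) { return (bits & ~kSignBit) > kExponentBits; }
// Inf: exponent all ones and mantissa zero, so the magnitude equals the mask.
inline bool IsInfHalf(uint16_t bits) { return (bits & ~kSignBit) == kExponentBits; }

int main() {
  assert(IsInfHalf(0x7C00) && IsInfHalf(0xFC00));    // +inf, -inf
  assert(IsNanHalf(0x7C01) && IsNanHalf(0xFE00));    // NaN payloads
  assert(!IsNanHalf(0x3C00) && !IsInfHalf(0x3C00));  // 1.0
  return 0;
}

This layout is also why PosInfValue<half_t> is Binary(MSHADOW_HALF_EXPONENT_BITS) and NegInfValue<half_t> additionally sets the sign bit.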
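The sum reducer change (PATCH 01, and the matching Merge fix in PATCH 10) guards Kahan compensation against overflow: once the running total t is infinite, the correction term (t - dst) - y evaluates to inf - inf == NaN and would poison every later element, so the residual is reset to zero and the reduction saturates at +/-inf instead of returning NaN. A sketch of the scalar case, assuming plain double in place of mshadow's volatile DType parameters:

#include <cmath>
#include <cstdio>

// Compensated (Kahan) summation with the series' overflow guard.
double guarded_kahan_sum(const double* data, int n) {
  double dst = 0.0, residual = 0.0;
  for (int i = 0; i < n; ++i) {
    double y = data[i] - residual;
    double t = dst + y;
    // Without this guard: residual = (inf - dst) - y = inf - inf = NaN,
    // and every subsequent y = src - NaN turns the whole sum into NaN.
    residual = std::isinf(t) ? 0.0 : (t - dst) - y;
    dst = t;
  }
  return dst;
}

int main() {
  const double data[] = {1.0, INFINITY, 2.0};
  std::printf("%f\n", guarded_kahan_sum(data, 3));  // inf, matching np.sum
  return 0;
}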
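The maximum/minimum reducers drop std::max/std::min in favor of a guarded comparison so that NaN propagates the way numpy's max/min does. The two-step evolution in PATCHes 03/04 matters: !(dst > src) would overwrite dst with an equal src, while the final !(dst >= src) only assigns when src is strictly greater or NaN. A self-contained sketch of the max case, using -INFINITY for red::limits::NegInfValue:

#include <cmath>
#include <cstdio>

// NaN-propagating max: one NaN anywhere in the input makes the result NaN.
double nan_propagating_max(const double* data, int n) {
  double dst = -INFINITY;
  for (int i = 0; i < n; ++i) {
    if (!std::isnan(dst)) {
      // !(dst >= src) is true when src > dst *or* src is NaN,
      // so a NaN src is captured here and then sticks.
      if (!(dst >= data[i])) dst = data[i];
    }
  }
  return dst;
}

int main() {
  const double a[] = {-1.0, 3.0, 2.0};
  const double b[] = {-1.0, NAN, 2.0};
  std::printf("%f %f\n", nan_propagating_max(a, 3),   // 3.000000
              nan_propagating_max(b, 3));             // nan
  return 0;
}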
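PATCH 05's change to reduce_with_axis.h makes argmin/argmax agree with numpy by recording the index only when the running value actually changed and was not already NaN. The second condition is the subtle part: once the reducer holds NaN, tmp != res is true on every iteration (NaN compares unequal to itself), so without the !IsNan(tmp) guard the reported index would keep advancing past the first NaN. A standalone sketch of the argmax case, flattening the mshadow Plan into a plain loop under the same assumptions as above:

#include <cmath>
#include <cstdio>

// argmax with numpy semantics: the first NaN wins.
int nan_aware_argmax(const double* data, int n) {
  double res = -INFINITY;
  int idx = 0;
  for (int k = 0; k < n; ++k) {
    double tmp = res;
    if (!std::isnan(res) && !(res >= data[k])) res = data[k];  // maximum::Reduce
    // Record k only if the value moved and the previous value was not
    // already NaN; NaN != NaN would otherwise bump idx on every element.
    if (tmp != res && !std::isnan(tmp)) idx = k;
  }
  return idx;
}

int main() {
  const double a[] = {3.0, NAN, 7.0, NAN};
  std::printf("%d\n", nan_aware_argmax(a, 4));  // 1, same as np.argmax
  return 0;
}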