From 61f3dbc8daa5e29010447c4d6b929908302cfcd3 Mon Sep 17 00:00:00 2001
From: Hao Jin <hjjn.amzn@gmail.com>
Date: Fri, 30 Aug 2019 05:40:57 +0800
Subject: [PATCH] numpy-compatible cumsum upstream (#15924)

---
 python/mxnet/_numpy_op_doc.py          |  51 +++++++
 src/operator/numpy/np_cumsum-inl.h     | 188 +++++++++++++++++++++++++
 src/operator/numpy/np_cumsum.cc        |  94 +++++++++++++
 src/operator/numpy/np_cumsum.cu        |  37 +++++
 tests/python/unittest/test_numpy_op.py |  43 +++++-
 5 files changed, 412 insertions(+), 1 deletion(-)
 create mode 100644 src/operator/numpy/np_cumsum-inl.h
 create mode 100644 src/operator/numpy/np_cumsum.cc
 create mode 100644 src/operator/numpy/np_cumsum.cu

diff --git a/python/mxnet/_numpy_op_doc.py b/python/mxnet/_numpy_op_doc.py
index 5543ebc8e8c9..4e5ea74c2771 100644
--- a/python/mxnet/_numpy_op_doc.py
+++ b/python/mxnet/_numpy_op_doc.py
@@ -52,3 +52,54 @@ def _np_zeros_like(a):
         Array of zeros with the same shape and type as `a`.
     """
     pass
+
+
+def _np_cumsum(a, axis=None, dtype=None, out=None):
+    """Return the cumulative sum of the elements along a given axis.
+
+    Parameters
+    ----------
+    a : array_like
+        Input array.
+    axis : int, optional
+        Axis along which the cumulative sum is computed. The default
+        (None) is to compute the cumsum over the flattened array.
+    dtype : dtype, optional
+        Type of the returned array and of the accumulator in which the
+        elements are summed.  If `dtype` is not specified, it defaults
+        to the dtype of `a`, unless `a` has an integer dtype with a
+        precision less than that of the default platform integer.  In
+        that case, the default platform integer is used.
+    out : ndarray, optional
+        Alternative output array in which to place the result. It must
+        have the same shape and buffer length as the expected output
+        but the type will be cast if necessary. See `doc.ufuncs`
+        (Section "Output arguments") for more details.
+
+    Returns
+    -------
+    cumsum_along_axis : ndarray.
+        A new array holding the result is returned unless `out` is
+        specified, in which case a reference to `out` is returned. The
+        result has the same size as `a`, and the same shape as `a` if
+        `axis` is not None or `a` is a 1-d array.
+
+    Examples
+    --------
+    >>> a = np.array([[1,2,3], [4,5,6]])
+    >>> a
+    array([[1, 2, 3],
+           [4, 5, 6]])
+    >>> np.cumsum(a)
+    array([ 1,  3,  6, 10, 15, 21])
+    >>> np.cumsum(a, dtype=float)     # specifies type of output value(s)
+    array([  1.,   3.,   6.,  10.,  15.,  21.])
+    >>> np.cumsum(a,axis=0)      # sum over rows for each of the 3 columns
+    array([[1, 2, 3],
+           [5, 7, 9]])
+    >>> np.cumsum(a,axis=1)      # sum over columns for each of the 2 rows
+    array([[ 1,  3,  6],
+           [ 4,  9, 15]])
+
+    """
+    pass
diff --git a/src/operator/numpy/np_cumsum-inl.h b/src/operator/numpy/np_cumsum-inl.h
new file mode 100644
index 000000000000..6c6b56d46e76
--- /dev/null
+++ b/src/operator/numpy/np_cumsum-inl.h
@@ -0,0 +1,188 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file np_cumsum-inl.h
+ * \brief Function definition of numpy-compatible cumsum operator
+ */
+
+#ifndef MXNET_OPERATOR_NUMPY_NP_CUMSUM_INL_H_
+#define MXNET_OPERATOR_NUMPY_NP_CUMSUM_INL_H_
+
+#include <mxnet/base.h>
+#include <mxnet/operator_util.h>
+#include <vector>
+#include "../mxnet_op.h"
+#include "../operator_common.h"
+#include "../elemwise_op_common.h"
+
+namespace mxnet {
+namespace op {
+
+struct CumsumParam : public dmlc::Parameter<CumsumParam> {
+  dmlc::optional<int> axis;
+  dmlc::optional<int> dtype;
+  DMLC_DECLARE_PARAMETER(CumsumParam) {
+    DMLC_DECLARE_FIELD(axis)
+      .set_default(dmlc::optional<int>())
+      .describe("Axis along which the cumulative sum is computed."
+        " The default (None) is to compute the cumsum over the flattened array.");
+    DMLC_DECLARE_FIELD(dtype)
+      .add_enum("float16", mshadow::kFloat16)
+      .add_enum("float32", mshadow::kFloat32)
+      .add_enum("float64", mshadow::kFloat64)
+      .add_enum("int8", mshadow::kInt8)
+      .add_enum("int32", mshadow::kInt32)
+      .add_enum("int64", mshadow::kInt64)
+      .set_default(dmlc::optional<int>())
+      .describe("Type of the returned array and of the accumulator in which the elements"
+                " are summed. If dtype is not specified, it defaults to the dtype of a,"
+                " unless a has an integer dtype with a precision less than that of the"
+                " default platform integer. In that case, the default platform integer is used.");
+  }
+};
+
+struct cumsum_forward {
+  template<typename IType, typename OType>
+  MSHADOW_XINLINE static void Map(int i,
+                                  OType *out,
+                                  const IType *in,
+                                  const int middle,
+                                  const int trailing) {
+    int left = i / trailing, right = i % trailing;
+    int offset = left * middle * trailing + right;
+    const IType *lane_in = in + offset;
+    OType *lane_out = out + offset;
+    lane_out[0] = OType(lane_in[0]);
+    for (int j = 1; j < middle; ++j) {
+      lane_out[j * trailing] = lane_out[(j - 1) * trailing] + OType(lane_in[j * trailing]);
+    }
+  }
+};
+
+template<typename xpu>
+void CumsumForwardImpl(const OpContext& ctx,
+                       const TBlob& in,
+                       const TBlob& out,
+                       const dmlc::optional<int>& axis) {
+  using namespace mshadow;
+  using namespace mxnet_op;
+
+  CHECK(!axis.has_value() ||
+        ((axis.value() >= -out.shape_.ndim()) && axis.value() < out.shape_.ndim()))
+    << "axis value " << axis.value() << " out of range";
+
+  int middle = axis.has_value() ? out.shape_[axis.value()] : out.Size();
+  if (middle == 0 || out.Size() == 0) return;
+  int trailing = 1;
+  if (axis.has_value()) {
+    for (int i = axis.value() + 1; i < out.shape_.ndim(); ++i) {
+      trailing *= out.shape_[i];
+    }
+  }
+
+  Stream<xpu> *s = ctx.get_stream<xpu>();
+  MSHADOW_TYPE_SWITCH(in.type_flag_, IType, {
+    MSHADOW_TYPE_SWITCH(out.type_flag_, OType, {
+      Kernel<cumsum_forward, xpu>::Launch(
+        s, out.Size() / middle, out.dptr<OType>(),
+        in.dptr<IType>(), middle, trailing);
+    });
+  });
+}
+
+template<typename xpu>
+void CumsumForward(const nnvm::NodeAttrs& attrs,
+                   const OpContext& ctx,
+                   const std::vector<TBlob>& inputs,
+                   const std::vector<OpReqType>& req,
+                   const std::vector<TBlob>& outputs) {
+  using namespace mshadow;
+  using namespace mxnet_op;
+  CHECK_EQ(inputs.size(), 1U);
+  CHECK_EQ(req.size(), 1U);
+  CHECK_EQ(outputs.size(), 1U);
+  const CumsumParam &param = nnvm::get<CumsumParam>(attrs.parsed);
+
+  CumsumForwardImpl<xpu>(ctx, inputs[0], outputs[0], param.axis);
+}
+
+struct cumsum_backward {
+  template<typename IType, typename OType>
+  MSHADOW_XINLINE static void Map(int i,
+                                  IType *igrad,
+                                  const OType *ograd,
+                                  const int middle,
+                                  const int trailing) {
+    int left = i / trailing, right = i % trailing;
+    int offset = left * middle * trailing + right;
+    const OType *lane_ograd = ograd + offset;
+    IType *lane_igrad = igrad + offset;
+    lane_igrad[(middle - 1) * trailing] = IType(lane_ograd[(middle - 1) * trailing]);
+    for (int j = middle - 2; j >= 0; --j) {
+      lane_igrad[j * trailing] = lane_igrad[(j + 1) * trailing] + IType(lane_ograd[j * trailing]);
+    }
+  }
+};
+
+template<typename xpu>
+void CumsumBackwardImpl(const OpContext& ctx,
+                        const TBlob& ograd,
+                        const TBlob& igrad,
+                        const dmlc::optional<int>& axis) {
+  using namespace mshadow;
+  using namespace mxnet_op;
+  int middle = axis.has_value() ? igrad.shape_[axis.value()] : igrad.Size();
+  if (middle == 0 || igrad.Size() == 0) return;
+  int trailing = 1;
+  if (axis.has_value()) {
+    for (int i = axis.value() + 1; i < igrad.shape_.ndim(); ++i) {
+      trailing *= igrad.shape_[i];
+    }
+  }
+  Stream<xpu> *s = ctx.get_stream<xpu>();
+  MSHADOW_TYPE_SWITCH(igrad.type_flag_, IType, {
+    MSHADOW_TYPE_SWITCH(ograd.type_flag_, OType, {
+      Kernel<cumsum_backward, xpu>::Launch(
+        s, igrad.Size() / middle, igrad.dptr<IType>(),
+        ograd.dptr<OType>(), middle, trailing);
+    });
+  });
+}
+
+template<typename xpu>
+void CumsumBackward(const nnvm::NodeAttrs& attrs,
+                    const OpContext& ctx,
+                    const std::vector<TBlob>& inputs,
+                    const std::vector<OpReqType>& req,
+                    const std::vector<TBlob>& outputs) {
+  using namespace mshadow;
+  using namespace mxnet_op;
+  CHECK_EQ(inputs.size(), 1U);
+  CHECK_EQ(req.size(), 1U);
+  CHECK_EQ(outputs.size(), 1U);
+  const CumsumParam &param = nnvm::get<CumsumParam>(attrs.parsed);
+
+  CumsumBackwardImpl<xpu>(ctx, inputs[0], outputs[0], param.axis);
+}
+
+}  // namespace op
+}  // namespace mxnet
+
+#endif  // MXNET_OPERATOR_NUMPY_NP_CUMSUM_INL_H_
diff --git a/src/operator/numpy/np_cumsum.cc b/src/operator/numpy/np_cumsum.cc
new file mode 100644
index 000000000000..0ddbf521186c
--- /dev/null
+++ b/src/operator/numpy/np_cumsum.cc
@@ -0,0 +1,94 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file np_cumsum.cc
+ * \brief CPU implementation of numpy-compatible cumsum operator
+ */
+
+#include "./np_cumsum-inl.h"
+
+namespace mxnet {
+namespace op {
+
+inline bool CumsumShape(const nnvm::NodeAttrs& attrs,
+                        mxnet::ShapeVector *in_attrs,
+                        mxnet::ShapeVector *out_attrs) {
+  CHECK_EQ(in_attrs->size(), 1U);
+  CHECK_EQ(out_attrs->size(), 1U);
+  const CumsumParam &param = nnvm::get<CumsumParam>(attrs.parsed);
+
+  if (param.axis.has_value()) {
+    return ElemwiseShape<1, 1>(attrs, in_attrs, out_attrs);
+  } else {
+    TShape out_shape(1, in_attrs->at(0).Size());
+    SHAPE_ASSIGN_CHECK(*out_attrs, 0, out_shape);
+    return shape_is_known(out_attrs->at(0));
+  }
+}
+
+inline bool CumsumType(const nnvm::NodeAttrs& attrs,
+                       std::vector<int> *in_attrs,
+                       std::vector<int> *out_attrs) {
+  CHECK_EQ(in_attrs->size(), 1U);
+  CHECK_EQ(out_attrs->size(), 1U);
+  const CumsumParam &param = nnvm::get<CumsumParam>(attrs.parsed);
+
+  if (param.dtype.has_value()) {
+    TYPE_ASSIGN_CHECK(*out_attrs, 0, param.dtype.value());
+  } else {
+    TYPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(0));
+    TYPE_ASSIGN_CHECK(*in_attrs, 0, out_attrs->at(0));
+  }
+
+  return out_attrs->at(0) != -1 && in_attrs->at(0) != -1;
+}
+
+DMLC_REGISTER_PARAMETER(CumsumParam);
+
+NNVM_REGISTER_OP(_np_cumsum)
+.add_alias("cumsum")
+.describe(R"code(Return the cumulative sum of the elements along a given axis.)code" ADD_FILELINE)
+.set_attr_parser(ParamParser<CumsumParam>)
+.set_num_inputs(1)
+.set_num_outputs(1)
+.set_attr<nnvm::FListInputNames>("FListInputNames",
+  [](const NodeAttrs& attrs) {
+    return std::vector<std::string>{"a"};
+  })
+.set_attr<mxnet::FInferShape>("FInferShape", CumsumShape)
+.set_attr<nnvm::FInferType>("FInferType", CumsumType)
+.set_attr<FCompute>("FCompute<cpu>", CumsumForward<cpu>)
+.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseNone{"_backward_np_cumsum"})
+.set_attr<nnvm::FInplaceOption>("FInplaceOption",
+  [](const NodeAttrs& attrs) {
+    return std::vector<std::pair<int, int> >{{0, 0}};
+  })
+.add_argument("a", "NDArray-or-Symbol", "Input ndarray")
+.add_arguments(CumsumParam::__FIELDS__());
+
+NNVM_REGISTER_OP(_backward_np_cumsum)
+.set_attr_parser(ParamParser<CumsumParam>)
+.set_num_inputs(1)
+.set_num_outputs(1)
+.set_attr<nnvm::TIsBackward>("TIsBackward", true)
+.set_attr<FCompute>("FCompute<cpu>", CumsumBackward<cpu>);
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/numpy/np_cumsum.cu b/src/operator/numpy/np_cumsum.cu
new file mode 100644
index 000000000000..cc574ebf72c5
--- /dev/null
+++ b/src/operator/numpy/np_cumsum.cu
@@ -0,0 +1,37 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file np_cumsum.cu
+ * \brief GPU implementation of numpy-compatible cumsum operator
+ */
+
+#include "./np_cumsum-inl.h"
+
+namespace mxnet {
+namespace op {
+
+NNVM_REGISTER_OP(_np_cumsum)
+.set_attr<FCompute>("FCompute<gpu>", CumsumForward<gpu>);
+
+NNVM_REGISTER_OP(_backward_np_cumsum)
+.set_attr<FCompute>("FCompute<gpu>", CumsumBackward<gpu>);
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py
index c468a6b7a0fa..b8b9ca75a173 100644
--- a/tests/python/unittest/test_numpy_op.py
+++ b/tests/python/unittest/test_numpy_op.py
@@ -39,7 +39,7 @@ class TestTensordot(HybridBlock):
         def __init__(self, axes):
             super(TestTensordot, self).__init__()
             self._axes = axes
-            
+
         def hybrid_forward(self, F, a, b):
             return F.np.tensordot(a, b, self._axes)
 
@@ -1365,6 +1365,47 @@ def test_random_seed():
         assert_almost_equal(ret[0].asnumpy(), ret[1].asnumpy(), rtol=1e-4, atol=1e-5, use_broadcast=False)
 
 
+@with_seed()
+@use_np
+def test_np_cumsum():
+    def np_cumsum_backward(ograd, axis=None, dtype=None):
+        return _np.flip(_np.cumsum(_np.flip(ograd, axis=axis), axis=axis, dtype=dtype), axis=axis)
+
+    class TestCumsum(HybridBlock):
+        def __init__(self, axis=None, dtype=None):
+            super(TestCumsum, self).__init__()
+            self._axis = axis
+            self._dtype = dtype
+
+        def hybrid_forward(self, F, a):
+            return F.np.cumsum(a, axis=self._axis, dtype=self._dtype)
+
+    shapes = [(2, 3, 4), (2, 0, 3), ()]
+    for hybridize in [True, False]:
+        for shape in shapes:
+            for axis in [None] + [i for i in range(0, len(shape))]:
+                for otype in [None, _np.float32, _np.float64]:
+                    test_cumsum = TestCumsum(axis=axis, dtype=otype)
+                    if hybridize:
+                        test_cumsum.hybridize()
+                    for itype in [_np.float16, _np.float32, _np.float64]:
+                        x = rand_ndarray(shape).astype(itype).as_np_ndarray()
+                        x.attach_grad()
+                        np_out = _np.cumsum(x.asnumpy(), axis=axis, dtype=otype)
+                        with mx.autograd.record():
+                            mx_out = test_cumsum(x)
+                        assert mx_out.shape == np_out.shape
+                        assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5)
+                        mx_out.backward()
+                        np_backward = np_cumsum_backward(_np.ones(np_out.shape, dtype=otype),
+                                                         axis=axis, dtype=otype).reshape(x.shape)
+                        assert_almost_equal(x.grad.asnumpy(), np_backward, rtol=1e-3, atol=1e-5)
+
+                        mx_out = np.cumsum(x, axis=axis, dtype=otype)
+                        np_out = _np.cumsum(x.asnumpy(), axis=axis, dtype=otype)
+                        assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5)
+
+
 if __name__ == '__main__':
     import nose
     nose.runmodule()