From b37713012a268f35e809fcfa29f5d8df1b5e2780 Mon Sep 17 00:00:00 2001
From: Hao Jin <hjjn.amzn@gmail.com>
Date: Sat, 18 May 2019 18:27:14 -0700
Subject: [PATCH] implementation for equivalence of tf.moments (#14842)

---
 src/operator/nn/moments-inl.h          | 254 +++++++++++++++++++++++++
 src/operator/nn/moments.cc             |  85 +++++++++
 src/operator/nn/moments.cu             |  39 ++++
 tests/python/unittest/test_operator.py |  28 +++
 4 files changed, 406 insertions(+)
 create mode 100644 src/operator/nn/moments-inl.h
 create mode 100644 src/operator/nn/moments.cc
 create mode 100644 src/operator/nn/moments.cu
diff --git a/src/operator/nn/moments-inl.h b/src/operator/nn/moments-inl.h
new file mode 100644
index 000000000000..6a9bdc54b905
--- /dev/null
+++ b/src/operator/nn/moments-inl.h
@@ -0,0 +1,254 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2019 by Contributors
+ * \file moments-inl.h
+ * \brief Moments operator
+ * \author Hao Jin
+*/
+
+#ifndef MXNET_OPERATOR_NN_MOMENTS_INL_H_
+#define MXNET_OPERATOR_NN_MOMENTS_INL_H_
+
+#include <vector>
+#include "../tensor/broadcast_reduce_op.h"
+
+namespace mxnet {
+namespace op {
+
+/*!
+ * \brief Parameters accepted by the moments operator.
+ */
+struct MomentsParam : public dmlc::Parameter<MomentsParam> {
+  /*! \brief axes along which mean and variance are computed; unset by default */
+  dmlc::optional<mxnet::TShape> axes;
+  /*! \brief keep the dimensionality of the input in the outputs; false by default */
+  bool keepdims;
+  DMLC_DECLARE_PARAMETER(MomentsParam) {
+    DMLC_DECLARE_FIELD(axes).set_default(dmlc::optional<mxnet::TShape>())
+      .describe("Array of ints. Axes along which to compute mean and variance.");
+    DMLC_DECLARE_FIELD(keepdims).set_default(false)
+      .describe("produce moments with the same dimensionality as the input.");
+  }
+};
+
+/*!
+ * \brief Shape inference for the moments operator.
+ *
+ * Both outputs are assigned the shape that ReduceAxesShapeImpl returns for the
+ * input shape, the configured axes and the keepdims flag.  Unset or empty axes
+ * abort with a fatal log message.
+ *
+ * \param attrs node attributes holding the parsed MomentsParam
+ * \param in_attrs input shapes, exactly one entry
+ * \param out_attrs output shapes, exactly two entries
+ * \return true once both output shapes have been assigned
+ */
+inline bool MomentsShape(const nnvm::NodeAttrs& attrs,
+                         mxnet::ShapeVector* in_attrs,
+                         mxnet::ShapeVector* out_attrs) {
+  const MomentsParam& param = nnvm::get<MomentsParam>(attrs.parsed);
+  CHECK_EQ(in_attrs->size(), 1U);
+  CHECK_EQ(out_attrs->size(), 2U);
+
+  const mxnet::TShape reduced =
+    ReduceAxesShapeImpl((*in_attrs)[0], param.axes, param.keepdims, false);
+
+  const bool axes_given = param.axes.has_value() && param.axes.value().ndim() != 0;
+  if (!axes_given) {
+    LOG(FATAL) << "Empty axes is not supported, if you would like to do global moments, "
+               << "please pass all axes to axes argument";
+  }
+
+  // mean (output 0) and variance (output 1) share one shape
+  for (int out_idx = 0; out_idx < 2; ++out_idx) {
+    SHAPE_ASSIGN_CHECK(*out_attrs, out_idx, reduced);
+  }
+  return true;
+}
+
+/*!
+ * \brief Type inference for the moments operator.
+ *
+ * The input type is propagated to both outputs, then each output type is
+ * propagated back to the input.
+ *
+ * \param attrs node attributes (not read here)
+ * \param in_attrs input type flags, exactly one entry
+ * \param out_attrs output type flags, exactly two entries
+ * \return true when neither output type is -1 afterwards
+ */
+inline bool MomentsType(const nnvm::NodeAttrs& attrs,
+                        std::vector<int>* in_attrs,
+                        std::vector<int>* out_attrs) {
+  CHECK_EQ(in_attrs->size(), 1U);
+  CHECK_EQ(out_attrs->size(), 2U);
+
+  // input -> outputs
+  for (int out_idx = 0; out_idx < 2; ++out_idx) {
+    TYPE_ASSIGN_CHECK(*out_attrs, out_idx, in_attrs->at(0));
+  }
+  // outputs -> input
+  for (int out_idx = 0; out_idx < 2; ++out_idx) {
+    TYPE_ASSIGN_CHECK(*in_attrs, 0, out_attrs->at(out_idx));
+  }
+
+  const bool mean_type_known = out_attrs->at(0) != -1;
+  const bool var_type_known = out_attrs->at(1) != -1;
+  return mean_type_known && var_type_known;
+}
+
+/*!
+ * \brief Element-wise kernel that writes, for every input element, the squared
+ *        difference between that element and the mean entry it maps to.
+ */
+struct VarBroadcastKernel {
+  /*!
+   * \param i flat index of the element handled by this call
+   * \param out destination buffer, indexed like data
+   * \param data input values
+   * \param mean mean values, addressed through mean_shape
+   * \param data_shape extents of data
+   * \param mean_shape extents of mean; an axis of extent 1 does not advance the mean index
+   */
+  template<typename DType>
+  MSHADOW_XINLINE static void Map(int i,
+                                  DType *out,
+                                  const DType *data,
+                                  const DType *mean,
+                                  mshadow::Shape<6> data_shape,
+                                  mshadow::Shape<6> mean_shape) {
+    size_t remaining = i;
+    size_t mean_offset = i;
+    size_t data_step = 1;
+    size_t mean_step = 1;
+    // peel one coordinate per axis, last axis first
+    for (int dim = 5; dim >= 0; --dim) {
+      const size_t coord = remaining % data_shape[dim];
+      remaining /= data_shape[dim];
+      // remove this axis' share of the data offset ...
+      mean_offset -= coord * data_step;
+      // ... and put it back with the mean stride when mean has this axis
+      if (mean_shape[dim] != 1) {
+        mean_offset += coord * mean_step;
+      }
+      data_step *= data_shape[dim];
+      mean_step *= mean_shape[dim];
+    }
+    const DType deviation = data[i] - mean[mean_offset];
+    out[i] = deviation * deviation;
+  }
+};
+
+/*!
+ * \brief Compute mean and variance of inputs[0] over the given axes.
+ *
+ * The mean is a normalized sum reduction of the data.  The variance is the
+ * normalized sum of the squared deviations from that mean; the squared
+ * deviations are staged in temp space obtained from ctx.requested[0].
+ *
+ * \param ctx operator context; ctx.requested[0] must provide temp space
+ * \param inputs inputs[0] is the data
+ * \param req req[0] is the write request for the mean, req[1] for the variance
+ * \param outputs outputs[0] receives the mean, outputs[1] the variance
+ * \param axes axes to reduce over
+ * \param keepdims when true the shape of outputs[0] is used as the reduced shape
+ */
+template<typename xpu>
+inline void MomentsForwardImpl(const OpContext& ctx,
+                               const std::vector<TBlob>& inputs,
+                               const std::vector<OpReqType>& req,
+                               const std::vector<TBlob>& outputs,
+                               const dmlc::optional<mxnet::TShape>& axes,
+                               const bool keepdims) {
+  using namespace mshadow;
+  using namespace mshadow_op;
+  using namespace mxnet_op;
+
+  Stream<xpu> *s = ctx.get_stream<xpu>();
+
+  const TBlob& data = inputs[0];
+  const TBlob& mean = outputs[0];
+  const TBlob& var = outputs[1];
+
+  // VarBroadcastKernel unravels indices through Shape<6>; further axes would be ignored
+  CHECK_LE(data.shape_.ndim(), 6)
+    << "moments supports at most 6 dimensions, but got " << data.shape_.ndim();
+
+  mxnet::TShape small;
+  if (keepdims) {
+    small = outputs[0].shape_;
+  } else {
+    small = ReduceAxesShapeImpl(inputs[0].shape_, axes, true, false);
+  }
+
+  ReduceAxesComputeImpl<xpu, mshadow_op::sum, true, true>(ctx, {data}, {req[0]}, {mean}, small);
+
+  // the variance output is not requested, so skip the second pass entirely
+  if (req[1] == kNullOp) return;
+
+  // The deviations need the plain mean.  The mean output only holds it after an
+  // overwriting request; for kNullOp/kAddTo the mean is staged in the variance
+  // buffer instead, which the final reduction overwrites anyway.
+  const bool mean_written = (req[0] == kWriteTo || req[0] == kWriteInplace);
+  if (!mean_written) {
+    CHECK_NE(req[1], kAddTo)
+      << "moments cannot accumulate into the variance when the mean output is not overwritten";
+    ReduceAxesComputeImpl<xpu, mshadow_op::sum, true, true>(ctx, {data}, {kWriteTo}, {var}, small);
+  }
+  const TBlob& mean_src = mean_written ? mean : var;
+
+  MSHADOW_TYPE_SWITCH(data.type_flag_, DType, {
+    Shape<6> data_shape, mean_shape;
+    for (int i = 0; i < 6; ++i) {
+      data_shape[i] = (i < data.shape_.ndim()) ? data.shape_[i] : 1;
+      mean_shape[i] = (i < small.ndim()) ? small[i] : 1;
+    }
+    // one squared deviation per input element
+    Tensor<xpu, 1, DType> temp_data =
+      ctx.requested[0].get_space_typed<xpu, 1, DType>(Shape1(data.shape_.Size()), s);
+    Kernel<VarBroadcastKernel, xpu>::Launch(s, data.shape_.Size(), temp_data.dptr_,
+      data.dptr<DType>(), mean_src.dptr<DType>(), data_shape, mean_shape);
+    // NOTE(review): ReduceAxesComputeImpl receives the same ctx and may draw its own
+    // workspace from ctx.requested[0]; confirm that cannot overlap temp_data.
+    ReduceAxesComputeImpl<xpu, mshadow_op::sum, true, true>(
+      ctx, {TBlob(temp_data).reshape(data.shape_)}, {req[1]}, {var}, small);
+  });
+}
+
+/*!
+ * \brief FCompute entry point of the moments operator.
+ *
+ * Checks the input/output counts and forwards to MomentsForwardImpl with the
+ * axes and keepdims values taken from the parsed MomentsParam.
+ *
+ * \param attrs node attributes holding the parsed MomentsParam
+ * \param ctx operator context
+ * \param inputs exactly one blob
+ * \param req write requests, passed through unchanged
+ * \param outputs exactly two blobs
+ */
+template<typename xpu>
+inline void MomentsForward(const nnvm::NodeAttrs& attrs,
+                           const OpContext& ctx,
+                           const std::vector<TBlob>& inputs,
+                           const std::vector<OpReqType>& req,
+                           const std::vector<TBlob>& outputs) {
+  CHECK_EQ(inputs.size(), 1U);
+  CHECK_EQ(outputs.size(), 2U);
+
+  const MomentsParam& moments_param = nnvm::get<MomentsParam>(attrs.parsed);
+  const dmlc::optional<mxnet::TShape>& reduce_axes = moments_param.axes;
+  const bool keep_reduced_dims = moments_param.keepdims;
+
+  MomentsForwardImpl<xpu>(ctx, inputs, req, outputs, reduce_axes, keep_reduced_dims);
+}
+
+/*!
+ * \brief Element-wise kernel for the variance part of the input gradient.
+ *
+ * For every input element it evaluates
+ * ograd * (data - mean) * 2 / (N - ddof) and combines the value with igrad
+ * through KERNEL_ASSIGN according to the req template argument.
+ */
+template<int req>
+struct VarBackwardKernel {
+  /*!
+   * \param i flat index of the element handled by this call
+   * \param igrad input gradient, indexed like data
+   * \param ograd gradient of the variance output, addressed through mean_shape
+   * \param data input values
+   * \param mean mean values, addressed through mean_shape
+   * \param data_shape extents of data
+   * \param mean_shape extents of mean; an axis of extent 1 does not advance the mean index
+   * \param N divisor before ddof is subtracted
+   * \param ddof value subtracted from N, 0 by default
+   */
+  template<typename DType>
+  MSHADOW_XINLINE static void Map(int i,
+                                  DType *igrad,
+                                  const DType *ograd,
+                                  const DType *data,
+                                  const DType *mean,
+                                  mshadow::Shape<6> data_shape,
+                                  mshadow::Shape<6> mean_shape,
+                                  const float N,
+                                  const float ddof = 0.0f) {
+    size_t remaining = i;
+    size_t mean_offset = i;
+    size_t data_step = 1;
+    size_t mean_step = 1;
+    // peel one coordinate per axis, last axis first
+    for (int dim = 5; dim >= 0; --dim) {
+      const size_t coord = remaining % data_shape[dim];
+      remaining /= data_shape[dim];
+      // remove this axis' share of the data offset ...
+      mean_offset -= coord * data_step;
+      // ... and put it back with the mean stride when mean has this axis
+      if (mean_shape[dim] != 1) {
+        mean_offset += coord * mean_step;
+      }
+      data_step *= data_shape[dim];
+      mean_step *= mean_shape[dim];
+    }
+    KERNEL_ASSIGN(igrad[i], req,
+                  ograd[mean_offset] * (data[i] - mean[mean_offset]) * 2 / (N - ddof));
+  }
+};
+
+/*!
+ * \brief Fused element-wise kernel for the input gradient of moments.
+ *
+ * For every input element it evaluates
+ *   mean_ograd / N + var_ograd * (data - mean) * 2 / N
+ * and combines the value with igrad through KERNEL_ASSIGN according to the
+ * req template argument, so write and accumulate requests are both honoured.
+ */
+template<int req>
+struct MomentsBackwardKernel {
+  /*!
+   * \param i flat index of the element handled by this call
+   * \param igrad input gradient, indexed like data
+   * \param mean_ograd gradient of the mean output, addressed through mean_shape
+   * \param var_ograd gradient of the variance output, addressed through mean_shape
+   * \param data input values
+   * \param mean mean values, addressed through mean_shape
+   * \param data_shape extents of data
+   * \param mean_shape extents of mean; an axis of extent 1 does not advance the mean index
+   * \param N number of input elements reduced into one output entry
+   */
+  template<typename DType>
+  MSHADOW_XINLINE static void Map(int i,
+                                  DType *igrad,
+                                  const DType *mean_ograd,
+                                  const DType *var_ograd,
+                                  const DType *data,
+                                  const DType *mean,
+                                  mshadow::Shape<6> data_shape,
+                                  mshadow::Shape<6> mean_shape,
+                                  const float N) {
+    size_t data_idx = i;
+    size_t mean_idx = i;
+    size_t data_stride = 1;
+    size_t mean_stride = 1;
+    for (int axis = 5; axis >= 0; --axis) {
+      size_t axis_idx = data_idx % data_shape[axis];
+      mean_idx -= axis_idx * data_stride;
+      if (mean_shape[axis] != 1) {
+        mean_idx += axis_idx * mean_stride;
+      }
+      data_idx /= data_shape[axis];
+      data_stride *= data_shape[axis];
+      mean_stride *= mean_shape[axis];
+    }
+    KERNEL_ASSIGN(igrad[i], req,
+                  mean_ograd[mean_idx] / N +
+                  var_ograd[mean_idx] * (data[i] - mean[mean_idx]) * 2 / N);
+  }
+};
+
+/*!
+ * \brief Gradient of the moments operator with respect to its input.
+ *
+ * The gradient of the mean contributes mean_grad / N to every input element and
+ * the gradient of the variance contributes var_grad * 2 * (data - mean) / N,
+ * where N is the number of input elements per output entry.  Both parts are
+ * produced by one kernel launch that follows req[0], so an accumulate request
+ * adds to the existing gradient instead of rescaling it and a null request
+ * leaves the output untouched.
+ *
+ * \param attrs node attributes (not read here)
+ * \param ctx operator context
+ * \param inputs gradient of mean, gradient of variance, data, mean, variance (in this order)
+ * \param req req[0] is the write request for the input gradient
+ * \param outputs outputs[0] receives the input gradient
+ * \param axes axes the forward pass reduced over
+ */
+template<typename xpu>
+inline void MomentsBackwardImpl(const nnvm::NodeAttrs& attrs,
+                                const OpContext& ctx,
+                                const std::vector<TBlob>& inputs,
+                                const std::vector<OpReqType>& req,
+                                const std::vector<TBlob>& outputs,
+                                const dmlc::optional<mxnet::TShape>& axes) {
+  using namespace mshadow;
+  using namespace mshadow_op;
+  using namespace mxnet_op;
+
+  // no gradient requested
+  if (req[0] == kNullOp) return;
+
+  Stream<xpu> *s = ctx.get_stream<xpu>();
+
+  const TBlob& mean_grad = inputs[0];
+  const TBlob& var_grad = inputs[1];
+  const TBlob& data = inputs[2];
+  const TBlob& mean = inputs[3];
+  const TBlob& var = inputs[4];
+  const TBlob& data_grad = outputs[0];
+
+  // nothing to write for an empty input; also keeps the division below well defined
+  if (data_grad.Size() == 0U) return;
+
+  // the kernel unravels indices through Shape<6>; further axes would be ignored
+  CHECK_LE(data.shape_.ndim(), 6)
+    << "moments supports at most 6 dimensions, but got " << data.shape_.ndim();
+
+  const mxnet::TShape small = ReduceAxesShapeImpl(data.shape_, axes, true, false);
+
+  Shape<6> data_shape, var_shape;
+  for (int i = 0; i < 6; ++i) {
+    data_shape[i] = (i < data.shape_.ndim()) ? data.shape_[i] : 1;
+    var_shape[i] = (i < small.ndim()) ? small[i] : 1;
+  }
+  // number of input elements reduced into each mean/variance entry
+  const float N = static_cast<float>(data_grad.Size() / var.Size());
+
+  MSHADOW_TYPE_SWITCH(data_grad.type_flag_, DType, {
+    MXNET_ASSIGN_REQ_SWITCH(req[0], req_type, {
+      Kernel<MomentsBackwardKernel<req_type>, xpu>::Launch(
+        s, data_grad.shape_.Size(), data_grad.dptr<DType>(),
+        mean_grad.dptr<DType>(), var_grad.dptr<DType>(),
+        data.dptr<DType>(), mean.dptr<DType>(), data_shape, var_shape, N);
+    });
+  });
+}
+
+/*!
+ * \brief FCompute entry point of the backward pass of moments.
+ *
+ * Checks the input/output counts and forwards to MomentsBackwardImpl with the
+ * axes taken from the parsed MomentsParam.
+ *
+ * \param attrs node attributes holding the parsed MomentsParam
+ * \param ctx operator context
+ * \param inputs exactly five blobs
+ * \param req write requests, passed through unchanged
+ * \param outputs exactly one blob
+ */
+template<typename xpu>
+inline void MomentsBackward(const nnvm::NodeAttrs& attrs,
+                            const OpContext& ctx,
+                            const std::vector<TBlob>& inputs,
+                            const std::vector<OpReqType>& req,
+                            const std::vector<TBlob>& outputs) {
+  CHECK_EQ(inputs.size(), 5U);
+  CHECK_EQ(outputs.size(), 1U);
+
+  const dmlc::optional<mxnet::TShape>& reduce_axes =
+    nnvm::get<MomentsParam>(attrs.parsed).axes;
+
+  MomentsBackwardImpl<xpu>(attrs, ctx, inputs, req, outputs, reduce_axes);
+}
+
+}  // namespace op
+}  // namespace mxnet
+#endif  // MXNET_OPERATOR_NN_MOMENTS_INL_H_
diff --git a/src/operator/nn/moments.cc b/src/operator/nn/moments.cc
new file mode 100644
index 000000000000..37b8cdf18750
--- /dev/null
+++ b/src/operator/nn/moments.cc
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2019 by Contributors
+ * \file moments.cc
+ * \brief Moments operator
+ * \author Hao Jin
+*/
+
+#include "./moments-inl.h"
+
+namespace mxnet {
+namespace op {
+
+// Register MomentsParam with the dmlc parameter system.
+DMLC_REGISTER_PARAMETER(MomentsParam);
+
+// Forward operator: one input (data), two outputs (mean and variance).
+NNVM_REGISTER_OP(moments)
+.describe(R"code(
+Calculate the mean and variance of `data`.
+
+The mean and variance are calculated by aggregating the contents of data across axes.
+If x is 1-D and axes = [0] this is just the mean and variance of a vector.
+
+Example:
+
+     x = [[1, 2, 3], [4, 5, 6]]
+     mean, var = moments(data=x, axes=[0])
+     mean = [2.5, 3.5, 4.5]
+     var = [2.25, 2.25, 2.25]
+     mean, var = moments(data=x, axes=[1])
+     mean = [2.0, 5.0]
+     var = [0.66666667, 0.66666667]
+     mean, var = moments(data=x, axes=[0, 1])
+     mean = [3.5]
+     var = [2.9166667]
+
+)code" ADD_FILELINE)
+.set_attr_parser(ParamParser<MomentsParam>)
+.set_num_inputs(1)
+.set_num_outputs(2)
+.set_attr<nnvm::FListInputNames>("FListInputNames",
+  [](const NodeAttrs& attrs) {
+    return std::vector<std::string>{"data"};
+  })
+.set_attr<mxnet::FInferShape>("FInferShape", MomentsShape)
+.set_attr<nnvm::FInferType>("FInferType", MomentsType)
+.set_attr<FCompute>("FCompute<cpu>", MomentsForward<cpu>)
+// temp space holds the squared deviations staged by MomentsForwardImpl
+.set_attr<FResourceRequest>("FResourceRequest",
+  [](const NodeAttrs& attrs) {
+    return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+  })
+// the backward node receives the output gradients, the input and both outputs
+.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseInOut{"_backward_moments"})
+// NOTE(review): pairs input 0 with output 0 (the mean) although the forward pass
+// reads the input again after writing the mean -- confirm this pairing is intended.
+.set_attr<nnvm::FInplaceOption>("FInplaceOption",
+  [](const NodeAttrs& attrs) {
+    return std::vector<std::pair<int, int> >{{0, 0}};
+  })
+.add_argument("data", "NDArray-or-Symbol", "Input ndarray")
+.add_arguments(MomentsParam::__FIELDS__());
+
+// Backward operator of moments: five inputs (gradient of mean, gradient of
+// variance, data, mean, variance) and one output (gradient of data).
+NNVM_REGISTER_OP(_backward_moments)
+.set_num_inputs(5)
+.set_num_outputs(1)
+.set_attr_parser(ParamParser<MomentsParam>)
+.set_attr<FCompute>("FCompute<cpu>", MomentsBackward<cpu>)
+.set_attr<nnvm::TIsBackward>("TIsBackward", true);
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/nn/moments.cu b/src/operator/nn/moments.cu
new file mode 100644
index 000000000000..a45ae33281be
--- /dev/null
+++ b/src/operator/nn/moments.cu
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2019 by Contributors
+ * \file moments.cu
+ * \brief Moments operator
+ * \author Hao Jin
+*/
+
+#include "./moments-inl.h"
+
+namespace mxnet {
+namespace op {
+
+// GPU compute function of the forward operator.
+NNVM_REGISTER_OP(moments)
+.set_attr<FCompute>("FCompute<gpu>", MomentsForward<gpu>);
+
+// GPU compute function of the backward operator.
+NNVM_REGISTER_OP(_backward_moments)
+.set_attr<FCompute>("FCompute<gpu>", MomentsBackward<gpu>);
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py
index 01798f58f16e..90d6b50e71c5 100644
--- a/tests/python/unittest/test_operator.py
+++ b/tests/python/unittest/test_operator.py
@@ -8180,6 +8180,34 @@ def test_split_v2():
     check_symbolic_backward(sym, {"data": mx_data}, out_grad, [np.concatenate(out_grad, axis=axis)])
 
 
+@with_seed()
+def test_moments():
+    """Compare mx moments with numpy mean/var on a random shape and random axes.
+
+    Checks the imperative outputs (shape and values), the symbolic forward
+    result of mean + var, and the numeric gradient, with and without keepdims.
+    """
+    dim = random.randint(2, 5)
+    shape = rand_shape_nd(dim, dim=5)
+    axes = [i for i in range(dim)]
+    test_dims = random.sample(axes, random.randint(1, dim))
+    test_axes = tuple(sorted(test_dims))
+    eps = 1e-3
+    np_a = np.random.uniform(-1.0, 1.0, shape)
+    # move near-zero entries away from zero before the NDArray copy is taken,
+    # so `a` and `np_a` hold the same values
+    np_a[abs(np_a) < eps] = 2 * eps
+    a = mx.nd.array(np_a)
+    for keepdims in [True, False]:
+        np_mean = np.mean(np_a, axis=test_axes, keepdims=keepdims)
+        np_var = np.var(np_a, axis=test_axes, keepdims=keepdims)
+        mx_mean, mx_var = mx.nd.moments(a, keepdims=keepdims, axes=test_axes)
+        mx_sym = mx.sym.Variable("data")
+        mx_moments = mx.sym.moments(mx_sym, axes=test_axes, keepdims=keepdims)
+        mx_test_sym = mx.sym.elemwise_add(mx_moments[0], mx_moments[1])
+        # numpy returns 0-d arrays for a full reduction; align them with the mx shapes
+        if len(np_mean.shape) == 0:
+            np_mean = np_mean.reshape(mx_mean.shape)
+            np_var = np_var.reshape(mx_var.shape)
+        assert np_mean.shape == mx_mean.shape
+        assert np_var.shape == mx_var.shape
+        # check each output on its own; the symbolic check below only sees their sum
+        assert_almost_equal(mx_mean.asnumpy(), np_mean, rtol=1e-3, atol=1e-5)
+        assert_almost_equal(mx_var.asnumpy(), np_var, rtol=1e-3, atol=1e-5)
+        check_symbolic_forward(mx_test_sym, [np_a], [np_mean + np_var], rtol=1e-3, atol=1e-5)
+        check_numeric_gradient(mx_test_sym, [np_a], numeric_eps=eps, rtol=1e-2, atol=2e-4)
+
+
 @with_seed()
 def test_invalid_kernel_size():
     invalid_kernel_size = 28