From 2b1da703a16033d43b572a9dd84dae28d1f4eb8b Mon Sep 17 00:00:00 2001
From: Zhenghui Jin <69359374+barry-jin@users.noreply.github.com>
Date: Fri, 24 Sep 2021 14:05:35 -0700
Subject: [PATCH] [v2.0] Split Large Source Files (#20604)

* split np_kron

* refactor more

* fix sanity

* fix

* fix param

* fix np_where
---
 .../numpy/np_broadcast_reduce_op_value.cc     | 364 ------------------
 .../numpy/np_broadcast_reduce_op_value.cu     |  81 ----
 .../numpy/np_broadcast_reduce_op_value.h      | 193 ++++++++++
 ..._broadcast_reduce_op_value_broadcast_to.cc |  64 +++
 ..._broadcast_reduce_op_value_broadcast_to.cu |  37 ++
 .../numpy/np_broadcast_reduce_op_value_max.cc |  65 ++++
 .../numpy/np_broadcast_reduce_op_value_max.cu |  38 ++
 .../np_broadcast_reduce_op_value_mean.cc      |  63 +++
 .../np_broadcast_reduce_op_value_mean.cu      |  38 ++
 .../numpy/np_broadcast_reduce_op_value_min.cc |  65 ++++
 .../numpy/np_broadcast_reduce_op_value_min.cu |  38 ++
 .../np_broadcast_reduce_op_value_prod.cc      |  64 +++
 .../np_broadcast_reduce_op_value_prod.cu      |  38 ++
 .../numpy/np_broadcast_reduce_op_value_sum.cc |  67 ++++
 .../numpy/np_broadcast_reduce_op_value_sum.cu |  38 ++
 src/operator/numpy/np_dot_backward.cc         |  42 ++
 .../numpy/{np_dot.cu => np_dot_backward.cu}   |   4 +-
 .../numpy/{np_dot.cc => np_dot_forward.cc}    |  13 +-
 src/operator/numpy/np_dot_forward.cu          |  33 ++
 src/operator/numpy/np_kron_backward.cc        |  41 ++
 .../numpy/{np_kron.cu => np_kron_backward.cu} |   4 +-
 .../numpy/{np_kron.cc => np_kron_forward.cc}  |  12 +-
 src/operator/numpy/np_kron_forward.cu         |  33 ++
 src/operator/numpy/np_moments_op.cu           |  41 ++
 src/operator/numpy/np_where_backward_op.cc    |  59 +++
 src/operator/numpy/np_where_backward_op.cu    |  40 ++
 ...{np_where_op.cc => np_where_forward_op.cc} |  30 +-
 ...{np_where_op.cu => np_where_forward_op.cu} |  10 +-
 28 files changed, 1103 insertions(+), 512 deletions(-)
 delete mode 100644 src/operator/numpy/np_broadcast_reduce_op_value.cc
 delete mode 100644 src/operator/numpy/np_broadcast_reduce_op_value.cu
 create mode 100644 src/operator/numpy/np_broadcast_reduce_op_value.h
 create mode 100644 src/operator/numpy/np_broadcast_reduce_op_value_broadcast_to.cc
 create mode 100644 src/operator/numpy/np_broadcast_reduce_op_value_broadcast_to.cu
 create mode 100644 src/operator/numpy/np_broadcast_reduce_op_value_max.cc
 create mode 100644 src/operator/numpy/np_broadcast_reduce_op_value_max.cu
 create mode 100644 src/operator/numpy/np_broadcast_reduce_op_value_mean.cc
 create mode 100644 src/operator/numpy/np_broadcast_reduce_op_value_mean.cu
 create mode 100644 src/operator/numpy/np_broadcast_reduce_op_value_min.cc
 create mode 100644 src/operator/numpy/np_broadcast_reduce_op_value_min.cu
 create mode 100644 src/operator/numpy/np_broadcast_reduce_op_value_prod.cc
 create mode 100644 src/operator/numpy/np_broadcast_reduce_op_value_prod.cu
 create mode 100644 src/operator/numpy/np_broadcast_reduce_op_value_sum.cc
 create mode 100644 src/operator/numpy/np_broadcast_reduce_op_value_sum.cu
 create mode 100644 src/operator/numpy/np_dot_backward.cc
 rename src/operator/numpy/{np_dot.cu => np_dot_backward.cu} (90%)
 rename src/operator/numpy/{np_dot.cc => np_dot_forward.cc} (91%)
 create mode 100644 src/operator/numpy/np_dot_forward.cu
 create mode 100644 src/operator/numpy/np_kron_backward.cc
 rename src/operator/numpy/{np_kron.cu => np_kron_backward.cu} (91%)
 rename src/operator/numpy/{np_kron.cc => np_kron_forward.cc} (87%)
 create mode 100644 src/operator/numpy/np_kron_forward.cu
 create mode 100644 src/operator/numpy/np_moments_op.cu
 create mode 100644 src/operator/numpy/np_where_backward_op.cc
 create mode 100644 src/operator/numpy/np_where_backward_op.cu
 rename src/operator/numpy/{np_where_op.cc => np_where_forward_op.cc} (90%)
 rename src/operator/numpy/{np_where_op.cu => np_where_forward_op.cu} (79%)

diff --git a/src/operator/numpy/np_broadcast_reduce_op_value.cc b/src/operator/numpy/np_broadcast_reduce_op_value.cc
deleted file mode 100644
index 2cfc038b223f..000000000000
--- a/src/operator/numpy/np_broadcast_reduce_op_value.cc
+++ /dev/null
@@ -1,364 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * Copyright (c) 2019 by Contributors
- * \file np_reduce_op_value.cc
- * \brief CPU Implementation of broadcast and reduce functions based on value.
- */
-
-/*
- * move some ops to np_moments_op.cc to avoid win platform build error:
- * fatal error C1002: compiler is out of heap space in pass 2
- *
- * Do not add new ops in this file.
- */
-
-#if MXNET_USE_TVM_OP
-#include "../tvmop/op_module.h"
-#endif  // MXNET_USE_TVM_OP
-
-#include "np_broadcast_reduce_op.h"
-
-namespace mxnet {
-namespace op {
-
-DMLC_REGISTER_PARAMETER(NumpyReduceAxesParam);
-DMLC_REGISTER_PARAMETER(NumpyReduceAxesNoDTypeParam);
-
-inline bool NumpySumType(const nnvm::NodeAttrs& attrs,
-                         std::vector<int>* in_attrs,
-                         std::vector<int>* out_attrs) {
-  CHECK_EQ(in_attrs->size(), 1U);
-  CHECK_EQ(out_attrs->size(), 1U);
-  const NumpyReduceAxesParam& param = nnvm::get<NumpyReduceAxesParam>(attrs.parsed);
-
-  if (param.dtype.has_value()) {
-    if (in_attrs->at(0) == mshadow::kBool) {
-      CHECK(param.dtype.value() == mshadow::kInt32 || param.dtype.value() == mshadow::kInt64 ||
-            param.dtype.value() == mshadow::kFloat32 || param.dtype.value() == mshadow::kFloat64)
-          << "Only support the following output dtypes when input dtype is bool: "
-             "int32, int64, float32, float64.";
-    }
-    TYPE_ASSIGN_CHECK(*out_attrs, 0, param.dtype.value());
-  } else if (in_attrs->at(0) == mshadow::kBool) {
-    TYPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::kInt64);
-  } else {
-    TYPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(0));
-    TYPE_ASSIGN_CHECK(*in_attrs, 0, out_attrs->at(0));
-  }
-
-  return out_attrs->at(0) != -1 && in_attrs->at(0) != -1;
-}
-
-#if MXNET_USE_TVM_OP
-static constexpr int max_reduce_ndim = 5;
-TBlob PrependAxes(const TBlob& src, const int dst_ndim);
-#endif  // MXNET_USE_TVM_OP
-
-void TVMOpReduce(const OpContext& ctx,
-                 const TBlob& input,
-                 const dmlc::optional<mxnet::Tuple<int>>& axis,
-                 const TBlob& output,
-                 const OpReqType req,
-                 const std::string& reducer_name) {
-#if MXNET_USE_TVM_OP
-  CHECK_GE(input.ndim(), output.ndim());
-  CHECK_LE(input.ndim(), max_reduce_ndim)
-      << "TVMOpReduce only supports ndim <= " << max_reduce_ndim;
-
-  const TBlob expanded_output =
-      (input.ndim() == output.ndim()
-           ? output
-           : output.reshape(NumpyReduceAxesShapeImpl(input.shape_, axis, true)));
-  CHECK_EQ(input.ndim(), expanded_output.ndim());
-  int reduce1st_dim = 0;
-  if (input.ndim() > 0 && input.size(0) != expanded_output.size(0)) {
-    reduce1st_dim = 1;
-  }
-  // collapse consecutive dimensions where reductions are performed or not performed
-  std::vector<index_t> ishape_vec;
-  for (int i = 0; i < input.ndim(); ++i) {
-    if (i == 0 || ((input.size(i) != expanded_output.size(i)) !=
-                   (input.size(i - 1) != expanded_output.size(i - 1)))) {
-      ishape_vec.push_back(input.size(i));
-    } else {
-      ishape_vec.back() *= input.size(i);
-    }
-  }
-  // append axes after collapsed ishape to reach the max ndim allowed
-  for (int i = ishape_vec.size(); i < max_reduce_ndim; ++i) {
-    ishape_vec.push_back(1);
-  }
-  std::vector<index_t> oshape_vec;
-  for (size_t i = reduce1st_dim; i < ishape_vec.size(); i += 2) {
-    oshape_vec.push_back(ishape_vec[i]);
-  }
-  TShape ishape(ishape_vec.begin(), ishape_vec.end()), oshape(oshape_vec.begin(), oshape_vec.end());
-  TBlob input_tvm  = input.reshape(ishape);
-  TBlob output_tvm = output.reshape(oshape);
-  const std::string ctx_name =
-      (ctx.run_ctx.ctx.dev_type == mxnet::Context::DeviceType::kCPU) ? "cpu" : "gpu";
-  std::ostringstream func_name;
-  func_name << reducer_name << "_"
-            << (ctx.run_ctx.ctx.dev_type == mxnet::Context::DeviceType::kCPU ? "cpu" : "gpu")
-            << "reduce1st_dim_" << reduce1st_dim << "req_"
-            << (req == kWriteTo ? "kWriteTo" : "kAddTo");
-  tvm::runtime::TVMOpModule::Get()->Call(func_name.str(), ctx, {input_tvm, output_tvm, output_tvm});
-#else
-  LOG(FATAL) << "Please add USE_TVM_OP=1 as a compile flag to enable TVM-generated kernels.";
-#endif  // MXNET_USE_TVM_OP
-}
-
-NNVM_REGISTER_OP(_npi_sum)
-    .describe(R"code()code" ADD_FILELINE)
-    .set_num_inputs(1)
-    .set_num_outputs(1)
-    .set_attr_parser(ParamParser<NumpyReduceAxesParam>)
-    .set_attr<mxnet::FInferShape>("FInferShape", NumpyReduceAxesShape)
-    .set_attr<nnvm::FInferType>("FInferType", NumpySumType)
-    .set_attr<nnvm::FListInputNames>("FListInputNames",
-                                     [](const NodeAttrs& attrs) {
-                                       return std::vector<std::string>{"a"};
-                                     })
-    .add_argument("a", "NDArray-or-Symbol", "The input")
-    .add_arguments(NumpyReduceAxesParam::__FIELDS__())
-    .set_attr<FCompute>("FCompute", NumpyReduceAxesCompute<cpu, mshadow_op::sum, true>)
-    .set_attr<FResourceRequest>("FResourceRequest",
-                                [](const NodeAttrs& attrs) {
-                                  return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
-                                })
-    .set_attr<THasDeterministicOutput>("THasDeterministicOutput", true)
-    .set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseNone{"_backward_npi_sum"});
-
-NNVM_REGISTER_OP(_backward_npi_sum)
-    .set_num_outputs(1)
-    .set_attr_parser(ParamParser<NumpyReduceAxesParam>)
-    .set_attr<nnvm::TIsBackward>("TIsBackward", true)
-    .set_num_inputs(1)
-    .set_attr<FCompute>("FCompute", NumpyReduceAxesBackwardUseNone<cpu>);
-
-inline bool NumpyReduceAxesNoDTypeType(const nnvm::NodeAttrs& attrs,
-                                       std::vector<int>* in_attrs,
-                                       std::vector<int>* out_attrs) {
-  CHECK_EQ(in_attrs->size(), 1U);
-  CHECK_EQ(out_attrs->size(), 1U);
-  TYPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(0));
-  TYPE_ASSIGN_CHECK(*in_attrs, 0, out_attrs->at(0));
-
-  return out_attrs->at(0) != -1 && in_attrs->at(0) != -1;
-}
-
-NNVM_REGISTER_OP(_npi_max)
-    .add_alias("_npi_amax")
-    .describe(R"code()code" ADD_FILELINE)
-    .set_num_inputs(1)
-    .set_num_outputs(1)
-    .set_attr_parser(ParamParser<NumpyReduceAxesNoDTypeParam>)
-    .set_attr<mxnet::FInferShape>("FInferShape", NumpyReduceAxesNoDTypeShape)
-    .set_attr<nnvm::FInferType>("FInferType", NumpyReduceAxesNoDTypeType)
-    .set_attr<nnvm::FListInputNames>("FListInputNames",
-                                     [](const NodeAttrs& attrs) {
-                                       return std::vector<std::string>{"a"};
-                                     })
-    .add_argument("a", "NDArray-or-Symbol", "The input")
-    .add_arguments(NumpyReduceAxesNoDTypeParam::__FIELDS__())
-    .set_attr<FCompute>("FCompute", NumpyReduceAxesNoDTypeCompute<cpu, mshadow::red::maximum>)
-    .set_attr<FResourceRequest>("FResourceRequest",
-                                [](const NodeAttrs& attrs) {
-                                  return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
-                                })
-    .set_attr<THasDeterministicOutput>("THasDeterministicOutput", true)
-    .set_attr<nnvm::FGradient>("FGradient", ReduceGrad{"_backward_npi_max"});
-
-NNVM_REGISTER_OP(_backward_npi_max)
-    .set_num_outputs(1)
-    .set_attr_parser(ParamParser<NumpyReduceAxesNoDTypeParam>)
-    .set_attr<nnvm::TIsBackward>("TIsBackward", true)
-    .set_num_inputs(3)
-    .set_attr<FCompute>("FCompute", NumpyReduceAxesNoDTypeBackward<cpu, mshadow_op::eq>);
-
-NNVM_REGISTER_OP(_npi_min)
-    .add_alias("_npi_amin")
-    .describe(R"code()code" ADD_FILELINE)
-    .set_num_inputs(1)
-    .set_num_outputs(1)
-    .set_attr_parser(ParamParser<NumpyReduceAxesNoDTypeParam>)
-    .set_attr<mxnet::FInferShape>("FInferShape", NumpyReduceAxesNoDTypeShape)
-    .set_attr<nnvm::FInferType>("FInferType", NumpyReduceAxesNoDTypeType)
-    .set_attr<nnvm::FListInputNames>("FListInputNames",
-                                     [](const NodeAttrs& attrs) {
-                                       return std::vector<std::string>{"a"};
-                                     })
-    .add_argument("a", "NDArray-or-Symbol", "The input")
-    .add_arguments(NumpyReduceAxesNoDTypeParam::__FIELDS__())
-    .set_attr<FCompute>("FCompute", NumpyReduceAxesNoDTypeCompute<cpu, mshadow::red::minimum>)
-    .set_attr<FResourceRequest>("FResourceRequest",
-                                [](const NodeAttrs& attrs) {
-                                  return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
-                                })
-    .set_attr<THasDeterministicOutput>("THasDeterministicOutput", true)
-    .set_attr<nnvm::FGradient>("FGradient", ReduceGrad{"_backward_npi_min"});
-
-NNVM_REGISTER_OP(_backward_npi_min)
-    .set_num_outputs(1)
-    .set_attr_parser(ParamParser<NumpyReduceAxesNoDTypeParam>)
-    .set_attr<nnvm::TIsBackward>("TIsBackward", true)
-    .set_num_inputs(3)
-    .set_attr<FCompute>("FCompute", NumpyReduceAxesNoDTypeBackward<cpu, mshadow_op::eq>);
-
-NNVM_REGISTER_OP(_npi_prod)
-    .add_alias("_np_product")
-    .set_num_inputs(1)
-    .set_num_outputs(1)
-    .set_attr_parser(ParamParser<NumpyReduceAxesParam>)
-    .set_attr<mxnet::FInferShape>("FInferShape", NumpyReduceAxesShape)
-    .set_attr<nnvm::FInferType>("FInferType", NumpySumType)
-    .add_arguments(NumpyReduceAxesParam::__FIELDS__())
-    .set_attr<nnvm::FListInputNames>("FListInputNames",
-                                     [](const NodeAttrs& attrs) {
-                                       return std::vector<std::string>{"a"};
-                                     })
-    .add_argument("a", "NDArray-or-Symbol", "The input")
-    .set_attr<FCompute>("FCompute", NumpyReduceAxesCompute<cpu, mshadow_op::product, false>)
-    .set_attr<FResourceRequest>("FResourceRequest",
-                                [](const NodeAttrs& attrs) {
-                                  return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
-                                })
-    .set_attr<THasDeterministicOutput>("THasDeterministicOutput", true)
-    .set_attr<nnvm::FGradient>("FGradient", ReduceGrad{"_backward_npi_prod"});
-
-NNVM_REGISTER_OP(_backward_npi_prod)
-    .set_num_inputs(3)
-    .set_num_outputs(1)
-    .set_attr_parser(ParamParser<NumpyReduceAxesParam>)
-    .set_attr<nnvm::TIsBackward>("TIsBackward", true)
-    .set_attr<FCompute>("FCompute", NumpyReduceAxesBackwardUseInOut<cpu, mshadow_op::rdiv>);
-
-inline bool IsIntType(const int dtype) {
-  return (dtype == mshadow::kUint8 || dtype == mshadow::kInt32 || dtype == mshadow::kInt8 ||
-          dtype == mshadow::kInt64);
-}
-
-inline bool NumpyMeanType(const nnvm::NodeAttrs& attrs,
-                          std::vector<int>* in_attrs,
-                          std::vector<int>* out_attrs) {
-  CHECK_EQ(in_attrs->size(), 1U);
-  CHECK_EQ(out_attrs->size(), 1U);
-  const NumpyReduceAxesParam& param = nnvm::get<NumpyReduceAxesParam>(attrs.parsed);
-
-  if (param.dtype.has_value()) {
-    TYPE_ASSIGN_CHECK(*out_attrs, 0, param.dtype.value());
-  } else {
-    if (common::is_float(in_attrs->at(0))) {
-      TYPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(0));
-      TYPE_ASSIGN_CHECK(*in_attrs, 0, out_attrs->at(0));
-    } else {
-      TYPE_ASSIGN_CHECK(*out_attrs, 0, mxnet::common::GetDefaultDtype());
-    }
-  }
-
-  return out_attrs->at(0) != -1 && in_attrs->at(0) != -1;
-}
-
-NNVM_REGISTER_OP(_npi_mean)
-    .set_num_inputs(1)
-    .set_num_outputs(1)
-    .set_attr_parser(ParamParser<NumpyReduceAxesParam>)
-    .set_attr<mxnet::FInferShape>("FInferShape", NumpyReduceAxesShape)
-    .set_attr<nnvm::FInferType>("FInferType", NumpyMeanType)
-    .set_attr<nnvm::FListInputNames>("FListInputNames",
-                                     [](const NodeAttrs& attrs) {
-                                       return std::vector<std::string>{"a"};
-                                     })
-    .add_argument("a", "NDArray-or-Symbol", "The input")
-    .add_arguments(NumpyReduceAxesParam::__FIELDS__())
-    .set_attr<FCompute>("FCompute",
-                        NumpyReduceAxesCompute<cpu, mshadow_op::sum, true, true>)
-    .set_attr<FResourceRequest>("FResourceRequest",
-                                [](const NodeAttrs& attrs) {
-                                  return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
-                                })
-    .set_attr<THasDeterministicOutput>("THasDeterministicOutput", true)
-    .set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseNone{"_backward_np_mean"});
-
-NNVM_REGISTER_OP(_backward_np_mean)
-    .set_num_outputs(1)
-    .set_attr_parser(ParamParser<NumpyReduceAxesParam>)
-    .set_attr<nnvm::TIsBackward>("TIsBackward", true)
-    .set_num_inputs(1)
-    .set_attr<FCompute>("FCompute", NumpyReduceAxesBackwardUseNone<cpu, true>);
-
-bool NumpyBroadcastToShape(const nnvm::NodeAttrs& attrs,
-                           mxnet::ShapeVector* in_attrs,
-                           mxnet::ShapeVector* out_attrs) {
-  CHECK_EQ(in_attrs->size(), 1U);
-  CHECK_EQ(out_attrs->size(), 1U);
-  mxnet::TShape& ishape = (*in_attrs)[0];
-  if (!mxnet::shape_is_known(ishape))
-    return false;
-  const BroadcastToParam& param = nnvm::get<BroadcastToParam>(attrs.parsed);
-  CHECK_LE(ishape.ndim(), param.shape.ndim())
-      << "shape " << ishape << " is not broadcastable to " << param.shape;
-  TShape pshape = param.shape;
-  for (int i = param.shape.ndim() - 1; i >= 0; --i) {
-    int j = i - param.shape.ndim() + ishape.ndim();
-    if (j < 0)
-      break;
-    if (pshape[i] == -2) {
-      pshape[i] = ishape[j];
-    }
-    CHECK(ishape[j] == pshape[i] || ishape[j] == 1)
-        << "shape " << ishape << " is not broadcastable to " << pshape;
-  }
-  CHECK(mxnet::shape_is_known(pshape))
-      << "the objective shape for broadcasting array must be known";
-  SHAPE_ASSIGN_CHECK(*out_attrs, 0, pshape);
-  return true;
-}
-
-NNVM_REGISTER_OP(_npi_broadcast_to)
-    .set_num_inputs(1)
-    .set_num_outputs(1)
-    .set_attr<nnvm::FListInputNames>("FListInputNames",
-                                     [](const NodeAttrs& attrs) {
-                                       return std::vector<std::string>{"array"};
-                                     })
-    .set_attr<nnvm::FInferType>("FInferType", ElemwiseType<1, 1>)
-    .set_attr<nnvm::FGradient>("FGradient",
-                               [](const nnvm::ObjectPtr& n,
-                                  const std::vector<nnvm::NodeEntry>& ograds) {
-                                 return MakeNonlossGradNode(
-                                     "_backward_np_broadcast_to", n, ograds, {}, n->attrs.dict);
-                               })
-    .add_argument("array", "NDArray-or-Symbol", "The input")
-    .set_attr_parser(ParamParser<BroadcastToParam>)
-    .add_arguments(BroadcastToParam::__FIELDS__())
-    .set_attr<mxnet::FInferShape>("FInferShape", NumpyBroadcastToShape)
-    .set_attr<FCompute>("FCompute", NumpyBroadcastToForward<cpu>);
-
-NNVM_REGISTER_OP(_backward_np_broadcast_to)
-    .set_attr_parser(ParamParser<BroadcastToParam>)
-    .set_attr<nnvm::TIsBackward>("TIsBackward", true)
-    .set_attr<FCompute>("FCompute", NumpyBroadcastToBackward<cpu>)
-    .set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& attrs) {
-      return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
-    });
-
-}  // namespace op
-}  // namespace mxnet
diff --git a/src/operator/numpy/np_broadcast_reduce_op_value.cu b/src/operator/numpy/np_broadcast_reduce_op_value.cu
deleted file mode 100644
index f0ce8f90956e..000000000000
--- a/src/operator/numpy/np_broadcast_reduce_op_value.cu
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * Copyright (c) 2019 by Contributors
- * \file np_reduce_op_value.cu
- * \brief GPU Implementation of reduce functions based on value.
- */
-#include "np_broadcast_reduce_op.h"
-
-namespace mxnet {
-namespace op {
-NNVM_REGISTER_OP(_npi_sum).set_attr<FCompute>(
-    "FCompute",
-    ReduceAxesRTCCompute<NumpyReduceAxesParam, 0>{"identity", "red::sum{}", false});
-
-NNVM_REGISTER_OP(_backward_npi_sum)
-    .set_attr<FCompute>("FCompute", NumpyReduceAxesBackwardUseNone<gpu>);
-
-NNVM_REGISTER_OP(_npi_max).set_attr<FCompute>(
-    "FCompute",
-    ReduceAxesRTCCompute<NumpyReduceAxesNoDTypeParam, 0>{"identity", "red::maximum{}", false});
-
-NNVM_REGISTER_OP(_backward_npi_max)
-    .set_attr<FCompute>("FCompute", NumpyReduceAxesNoDTypeBackward<gpu, mshadow_op::eq>);
-
-NNVM_REGISTER_OP(_npi_min).set_attr<FCompute>(
-    "FCompute",
-    ReduceAxesRTCCompute<NumpyReduceAxesNoDTypeParam, 0>{"identity", "red::minimum{}", false});
-
-NNVM_REGISTER_OP(_backward_npi_min)
-    .set_attr<FCompute>("FCompute", NumpyReduceAxesNoDTypeBackward<gpu, mshadow_op::eq>);
-
-NNVM_REGISTER_OP(_npi_prod).set_attr<FCompute>(
-    "FCompute",
-    ReduceAxesRTCCompute<NumpyReduceAxesParam, 1>{"identity", "red::product{}", false});
-
-NNVM_REGISTER_OP(_backward_npi_prod)
-    .set_attr<FCompute>("FCompute", NumpyReduceAxesBackwardUseInOut<gpu, mshadow_op::rdiv>);
-
-NNVM_REGISTER_OP(_npi_average)
-    .set_attr<FCompute>("FCompute", NumpyWeightedAverageForward<gpu>);
-
-NNVM_REGISTER_OP(_backward_np_average)
-    .set_attr<FCompute>("FCompute", NumpyWeightedAverageBackward<gpu>);
-
-NNVM_REGISTER_OP(_npi_mean).set_attr<FCompute>(
-    "FCompute",
-    ReduceAxesRTCCompute<NumpyReduceAxesParam, 0>{"identity", "red::sum{}", true});
-
-NNVM_REGISTER_OP(_backward_np_mean)
-    .set_attr<FCompute>("FCompute", NumpyReduceAxesBackwardUseNone<gpu, true>);
-
-NNVM_REGISTER_OP(_npi_std).set_attr<FCompute>("FCompute", NumpyMomentsForward<gpu, true>);
-
-NNVM_REGISTER_OP(_npi_var).set_attr<FCompute>("FCompute", NumpyMomentsForward<gpu, false>);
-
-NNVM_REGISTER_OP(_npi_broadcast_to)
-    .set_attr<FCompute>("FCompute", NumpyBroadcastToForward<gpu>);
-
-NNVM_REGISTER_OP(_backward_np_broadcast_to)
-    .set_attr<FCompute>("FCompute", NumpyBroadcastToBackward<gpu>);
-
-}  // namespace op
-}  // namespace mxnet
diff --git a/src/operator/numpy/np_broadcast_reduce_op_value.h b/src/operator/numpy/np_broadcast_reduce_op_value.h
new file mode 100644
index 000000000000..2cdd77f2deb3
--- /dev/null
+++ b/src/operator/numpy/np_broadcast_reduce_op_value.h
@@ -0,0 +1,193 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2019 by Contributors
+ * \file np_broadcast_reduce_op_value.h
+ * \brief Definition of broadcast and reduce functions based on value.
+ */
+
+#ifndef MXNET_OPERATOR_NUMPY_NP_BROADCAST_REDUCE_OP_VALUE_H_
+#define MXNET_OPERATOR_NUMPY_NP_BROADCAST_REDUCE_OP_VALUE_H_
+
+#include <string>
+#include <vector>
+
+#if MXNET_USE_TVM_OP
+#include "../tvmop/op_module.h"
+#endif  // MXNET_USE_TVM_OP
+
+#include "np_broadcast_reduce_op.h"
+
+namespace mxnet {
+namespace op {
+
+inline bool NumpySumType(const nnvm::NodeAttrs& attrs,
+                         std::vector<int>* in_attrs,
+                         std::vector<int>* out_attrs) {
+  CHECK_EQ(in_attrs->size(), 1U);
+  CHECK_EQ(out_attrs->size(), 1U);
+  const NumpyReduceAxesParam& param = nnvm::get<NumpyReduceAxesParam>(attrs.parsed);
+
+  if (param.dtype.has_value()) {
+    if (in_attrs->at(0) == mshadow::kBool) {
+      CHECK(param.dtype.value() == mshadow::kInt32 || param.dtype.value() == mshadow::kInt64 ||
+            param.dtype.value() == mshadow::kFloat32 || param.dtype.value() == mshadow::kFloat64)
+          << "Only support the following output dtypes when input dtype is bool: "
+             "int32, int64, float32, float64.";
+    }
+    TYPE_ASSIGN_CHECK(*out_attrs, 0, param.dtype.value());
+  } else if (in_attrs->at(0) == mshadow::kBool) {
+    TYPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::kInt64);
+  } else {
+    TYPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(0));
+    TYPE_ASSIGN_CHECK(*in_attrs, 0, out_attrs->at(0));
+  }
+
+  return out_attrs->at(0) != -1 && in_attrs->at(0) != -1;
+}
+
+#if MXNET_USE_TVM_OP
+static constexpr int max_reduce_ndim = 5;
+TBlob PrependAxes(const TBlob& src, const int dst_ndim);
+#endif  // MXNET_USE_TVM_OP
+
+inline void TVMOpReduce(const OpContext& ctx,
+                        const TBlob& input,
+                        const dmlc::optional<mxnet::Tuple<int>>& axis,
+                        const TBlob& output,
+                        const OpReqType req,
+                        const std::string& reducer_name) {
+#if MXNET_USE_TVM_OP
+  CHECK_GE(input.ndim(), output.ndim());
+  CHECK_LE(input.ndim(), max_reduce_ndim)
+      << "TVMOpReduce only supports ndim <= " << max_reduce_ndim;
+
+  const TBlob expanded_output =
+      (input.ndim() == output.ndim()
+           ? output
+           : output.reshape(NumpyReduceAxesShapeImpl(input.shape_, axis, true)));
+  CHECK_EQ(input.ndim(), expanded_output.ndim());
+  int reduce1st_dim = 0;
+  if (input.ndim() > 0 && input.size(0) != expanded_output.size(0)) {
+    reduce1st_dim = 1;
+  }
+  // collapse consecutive dimensions where reductions are performed or not performed
+  std::vector<index_t> ishape_vec;
+  for (int i = 0; i < input.ndim(); ++i) {
+    if (i == 0 || ((input.size(i) != expanded_output.size(i)) !=
+                   (input.size(i - 1) != expanded_output.size(i - 1)))) {
+      ishape_vec.push_back(input.size(i));
+    } else {
+      ishape_vec.back() *= input.size(i);
+    }
+  }
+  // append axes after collapsed ishape to reach the max ndim allowed
+  for (int i = ishape_vec.size(); i < max_reduce_ndim; ++i) {
+    ishape_vec.push_back(1);
+  }
+  std::vector<index_t> oshape_vec;
+  for (size_t i = reduce1st_dim; i < ishape_vec.size(); i += 2) {
+    oshape_vec.push_back(ishape_vec[i]);
+  }
+  TShape ishape(ishape_vec.begin(), ishape_vec.end()), oshape(oshape_vec.begin(), oshape_vec.end());
+  TBlob input_tvm  = input.reshape(ishape);
+  TBlob output_tvm = output.reshape(oshape);
+  const std::string ctx_name =
+      (ctx.run_ctx.ctx.dev_type == mxnet::Context::DeviceType::kCPU) ? "cpu" : "gpu";
+  std::ostringstream func_name;
+  func_name << reducer_name << "_"
+            << (ctx.run_ctx.ctx.dev_type == mxnet::Context::DeviceType::kCPU ? "cpu" : "gpu")
+            << "reduce1st_dim_" << reduce1st_dim << "req_"
+            << (req == kWriteTo ? "kWriteTo" : "kAddTo");
+  tvm::runtime::TVMOpModule::Get()->Call(func_name.str(), ctx, {input_tvm, output_tvm, output_tvm});
+#else
+  LOG(FATAL) << "Please add USE_TVM_OP=1 as a compile flag to enable TVM-generated kernels.";
+#endif  // MXNET_USE_TVM_OP
+}
+
+inline bool NumpyReduceAxesNoDTypeType(const nnvm::NodeAttrs& attrs,
+                                       std::vector<int>* in_attrs,
+                                       std::vector<int>* out_attrs) {
+  CHECK_EQ(in_attrs->size(), 1U);
+  CHECK_EQ(out_attrs->size(), 1U);
+  TYPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(0));
+  TYPE_ASSIGN_CHECK(*in_attrs, 0, out_attrs->at(0));
+
+  return out_attrs->at(0) != -1 && in_attrs->at(0) != -1;
+}
+
+inline bool IsIntType(const int dtype) {
+  return (dtype == mshadow::kUint8 || dtype == mshadow::kInt32 || dtype == mshadow::kInt8 ||
+          dtype == mshadow::kInt64);
+}
+
+inline bool NumpyMeanType(const nnvm::NodeAttrs& attrs,
+                          std::vector<int>* in_attrs,
+                          std::vector<int>* out_attrs) {
+  CHECK_EQ(in_attrs->size(), 1U);
+  CHECK_EQ(out_attrs->size(), 1U);
+  const NumpyReduceAxesParam& param = nnvm::get<NumpyReduceAxesParam>(attrs.parsed);
+
+  if (param.dtype.has_value()) {
+    TYPE_ASSIGN_CHECK(*out_attrs, 0, param.dtype.value());
+  } else {
+    if (common::is_float(in_attrs->at(0))) {
+      TYPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(0));
+      TYPE_ASSIGN_CHECK(*in_attrs, 0, out_attrs->at(0));
+    } else {
+      TYPE_ASSIGN_CHECK(*out_attrs, 0, mxnet::common::GetDefaultDtype());
+    }
+  }
+
+  return out_attrs->at(0) != -1 && in_attrs->at(0) != -1;
+}
+
+inline bool NumpyBroadcastToShape(const nnvm::NodeAttrs& attrs,
+                                  mxnet::ShapeVector* in_attrs,
+                                  mxnet::ShapeVector* out_attrs) {
+  CHECK_EQ(in_attrs->size(), 1U);
+  CHECK_EQ(out_attrs->size(), 1U);
+  mxnet::TShape& ishape = (*in_attrs)[0];
+  if (!mxnet::shape_is_known(ishape))
+    return false;
+  const BroadcastToParam& param = nnvm::get<BroadcastToParam>(attrs.parsed);
+  CHECK_LE(ishape.ndim(), param.shape.ndim())
+      << "shape " << ishape << " is not broadcastable to " << param.shape;
+  TShape pshape = param.shape;
+  for (int i = param.shape.ndim() - 1; i >= 0; --i) {
+    int j = i - param.shape.ndim() + ishape.ndim();
+    if (j < 0)
+      break;
+    if (pshape[i] == -2) {
+      pshape[i] = ishape[j];
+    }
+    CHECK(ishape[j] == pshape[i] || ishape[j] == 1)
+        << "shape " << ishape << " is not broadcastable to " << pshape;
+  }
+  CHECK(mxnet::shape_is_known(pshape))
+      << "the objective shape for broadcasting array must be known";
+  SHAPE_ASSIGN_CHECK(*out_attrs, 0, pshape);
+  return true;
+}
+
+}  // namespace op
+}  // namespace mxnet
+
+#endif  // MXNET_OPERATOR_NUMPY_NP_BROADCAST_REDUCE_OP_VALUE_H_
diff --git a/src/operator/numpy/np_broadcast_reduce_op_value_broadcast_to.cc b/src/operator/numpy/np_broadcast_reduce_op_value_broadcast_to.cc
new file mode 100644
index 000000000000..e6de07948626
--- /dev/null
+++ b/src/operator/numpy/np_broadcast_reduce_op_value_broadcast_to.cc
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2019 by Contributors
+ * \file np_broadcast_reduce_op_value_broadcast_to.cc
+ * \brief CPU Implementation of broadcast and reduce functions based on value.
+ */
+
+#if MXNET_USE_TVM_OP
+#include "../tvmop/op_module.h"
+#endif  // MXNET_USE_TVM_OP
+
+#include "np_broadcast_reduce_op_value.h"
+
+namespace mxnet {
+namespace op {
+
+NNVM_REGISTER_OP(_npi_broadcast_to)
+    .set_num_inputs(1)
+    .set_num_outputs(1)
+    .set_attr<nnvm::FListInputNames>("FListInputNames",
+                                     [](const NodeAttrs& attrs) {
+                                       return std::vector<std::string>{"array"};
+                                     })
+    .set_attr<nnvm::FInferType>("FInferType", ElemwiseType<1, 1>)
+    .set_attr<nnvm::FGradient>("FGradient",
+                               [](const nnvm::ObjectPtr& n,
+                                  const std::vector<nnvm::NodeEntry>& ograds) {
+                                 return MakeNonlossGradNode(
+                                     "_backward_np_broadcast_to", n, ograds, {}, n->attrs.dict);
+                               })
+    .add_argument("array", "NDArray-or-Symbol", "The input")
+    .set_attr_parser(ParamParser<BroadcastToParam>)
+    .add_arguments(BroadcastToParam::__FIELDS__())
+    .set_attr<mxnet::FInferShape>("FInferShape", NumpyBroadcastToShape)
+    .set_attr<FCompute>("FCompute", NumpyBroadcastToForward<cpu>);
+
+NNVM_REGISTER_OP(_backward_np_broadcast_to)
+    .set_attr_parser(ParamParser<BroadcastToParam>)
+    .set_attr<nnvm::TIsBackward>("TIsBackward", true)
+    .set_attr<FCompute>("FCompute", NumpyBroadcastToBackward<cpu>)
+    .set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& attrs) {
+      return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+    });
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/numpy/np_broadcast_reduce_op_value_broadcast_to.cu b/src/operator/numpy/np_broadcast_reduce_op_value_broadcast_to.cu
new file mode 100644
index 000000000000..d58645e75e4c
--- /dev/null
+++ b/src/operator/numpy/np_broadcast_reduce_op_value_broadcast_to.cu
@@ -0,0 +1,37 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2019 by Contributors
+ * \file np_broadcast_reduce_op_value_broadcast_to.cu
+ * \brief GPU Implementation of reduce functions based on value.
+ */
+#include "np_broadcast_reduce_op.h"
+
+namespace mxnet {
+namespace op {
+
+NNVM_REGISTER_OP(_npi_broadcast_to)
+    .set_attr<FCompute>("FCompute", NumpyBroadcastToForward<gpu>);
+
+NNVM_REGISTER_OP(_backward_np_broadcast_to)
+    .set_attr<FCompute>("FCompute", NumpyBroadcastToBackward<gpu>);
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/numpy/np_broadcast_reduce_op_value_max.cc b/src/operator/numpy/np_broadcast_reduce_op_value_max.cc
new file mode 100644
index 000000000000..81816722abc1
--- /dev/null
+++ b/src/operator/numpy/np_broadcast_reduce_op_value_max.cc
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2019 by Contributors
+ * \file np_broadcast_reduce_op_value_max.cc
+ * \brief CPU Implementation of broadcast and reduce functions based on value.
+ */
+
+#if MXNET_USE_TVM_OP
+#include "../tvmop/op_module.h"
+#endif  // MXNET_USE_TVM_OP
+
+#include "np_broadcast_reduce_op_value.h"
+
+namespace mxnet {
+namespace op {
+
+NNVM_REGISTER_OP(_npi_max)
+    .add_alias("_npi_amax")
+    .describe(R"code()code" ADD_FILELINE)
+    .set_num_inputs(1)
+    .set_num_outputs(1)
+    .set_attr_parser(ParamParser<NumpyReduceAxesNoDTypeParam>)
+    .set_attr<mxnet::FInferShape>("FInferShape", NumpyReduceAxesNoDTypeShape)
+    .set_attr<nnvm::FInferType>("FInferType", NumpyReduceAxesNoDTypeType)
+    .set_attr<nnvm::FListInputNames>("FListInputNames",
+                                     [](const NodeAttrs& attrs) {
+                                       return std::vector<std::string>{"a"};
+                                     })
+    .add_argument("a", "NDArray-or-Symbol", "The input")
+    .add_arguments(NumpyReduceAxesNoDTypeParam::__FIELDS__())
+    .set_attr<FCompute>("FCompute", NumpyReduceAxesNoDTypeCompute<cpu, mshadow::red::maximum>)
+    .set_attr<FResourceRequest>("FResourceRequest",
+                                [](const NodeAttrs& attrs) {
+                                  return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+                                })
+    .set_attr<THasDeterministicOutput>("THasDeterministicOutput", true)
+    .set_attr<nnvm::FGradient>("FGradient", ReduceGrad{"_backward_npi_max"});
+
+NNVM_REGISTER_OP(_backward_npi_max)
+    .set_num_outputs(1)
+    .set_attr_parser(ParamParser<NumpyReduceAxesNoDTypeParam>)
+    .set_attr<nnvm::TIsBackward>("TIsBackward", true)
+    .set_num_inputs(3)
+    .set_attr<FCompute>("FCompute", NumpyReduceAxesNoDTypeBackward<cpu, mshadow_op::eq>);
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/numpy/np_broadcast_reduce_op_value_max.cu b/src/operator/numpy/np_broadcast_reduce_op_value_max.cu
new file mode 100644
index 000000000000..522af645367e
--- /dev/null
+++ b/src/operator/numpy/np_broadcast_reduce_op_value_max.cu
@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2019 by Contributors
+ * \file np_broadcast_reduce_op_value_max.cu
+ * \brief GPU Implementation of reduce functions based on value.
+ */
+#include "np_broadcast_reduce_op.h"
+
+namespace mxnet {
+namespace op {
+
+NNVM_REGISTER_OP(_npi_max).set_attr<FCompute>(
+    "FCompute",
+    ReduceAxesRTCCompute<NumpyReduceAxesNoDTypeParam, 0>{"identity", "red::maximum{}", false});
+
+NNVM_REGISTER_OP(_backward_npi_max)
+    .set_attr<FCompute>("FCompute", NumpyReduceAxesNoDTypeBackward<gpu, mshadow_op::eq>);
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/numpy/np_broadcast_reduce_op_value_mean.cc b/src/operator/numpy/np_broadcast_reduce_op_value_mean.cc
new file mode 100644
index 000000000000..094b65627e99
--- /dev/null
+++ b/src/operator/numpy/np_broadcast_reduce_op_value_mean.cc
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2019 by Contributors
+ * \file np_broadcast_reduce_op_value_mean.cc
+ * \brief CPU Implementation of broadcast and reduce functions based on value.
+ */
+
+#if MXNET_USE_TVM_OP
+#include "../tvmop/op_module.h"
+#endif  // MXNET_USE_TVM_OP
+
+#include "np_broadcast_reduce_op_value.h"
+
+namespace mxnet {
+namespace op {
+
+NNVM_REGISTER_OP(_npi_mean)
+    .set_num_inputs(1)
+    .set_num_outputs(1)
+    .set_attr_parser(ParamParser<NumpyReduceAxesParam>)
+    .set_attr<mxnet::FInferShape>("FInferShape", NumpyReduceAxesShape)
+    .set_attr<nnvm::FInferType>("FInferType", NumpyMeanType)
+    .set_attr<nnvm::FListInputNames>("FListInputNames",
+                                     [](const NodeAttrs& attrs) {
+                                       return std::vector<std::string>{"a"};
+                                     })
+    .add_argument("a", "NDArray-or-Symbol", "The input")
+    .add_arguments(NumpyReduceAxesParam::__FIELDS__())
+    .set_attr<FCompute>("FCompute", NumpyReduceAxesCompute<cpu, mshadow_op::sum, true, true>)
+    .set_attr<FResourceRequest>("FResourceRequest",
+                                [](const NodeAttrs& attrs) {
+                                  return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+                                })
+    .set_attr<THasDeterministicOutput>("THasDeterministicOutput", true)
+    .set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseNone{"_backward_np_mean"});
+
+NNVM_REGISTER_OP(_backward_np_mean)
+    .set_num_outputs(1)
+    .set_attr_parser(ParamParser<NumpyReduceAxesParam>)
+    .set_attr<nnvm::TIsBackward>("TIsBackward", true)
+    .set_num_inputs(1)
+    .set_attr<FCompute>("FCompute", NumpyReduceAxesBackwardUseNone<cpu, true>);
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/numpy/np_broadcast_reduce_op_value_mean.cu b/src/operator/numpy/np_broadcast_reduce_op_value_mean.cu
new file mode 100644
index 000000000000..554b281f2462
--- /dev/null
+++ b/src/operator/numpy/np_broadcast_reduce_op_value_mean.cu
@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2019 by Contributors
+ * \file np_broadcast_reduce_op_value_mean.cu
+ * \brief GPU Implementation of reduce functions based on value.
+ */
+#include "np_broadcast_reduce_op.h"
+
+namespace mxnet {
+namespace op {
+
+NNVM_REGISTER_OP(_npi_mean).set_attr<FCompute>(
+    "FCompute",
+    ReduceAxesRTCCompute<NumpyReduceAxesParam, 0>{"identity", "red::sum{}", true});
+
+NNVM_REGISTER_OP(_backward_np_mean)
+    .set_attr<FCompute>("FCompute", NumpyReduceAxesBackwardUseNone<gpu, true>);
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/numpy/np_broadcast_reduce_op_value_min.cc b/src/operator/numpy/np_broadcast_reduce_op_value_min.cc
new file mode 100644
index 000000000000..cdf8625156e4
--- /dev/null
+++ b/src/operator/numpy/np_broadcast_reduce_op_value_min.cc
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2019 by Contributors
+ * \file np_broadcast_reduce_op_value_min.cc
+ * \brief CPU Implementation of broadcast and reduce functions based on value.
+ */
+
+#if MXNET_USE_TVM_OP
+#include "../tvmop/op_module.h"
+#endif  // MXNET_USE_TVM_OP
+
+#include "np_broadcast_reduce_op_value.h"
+
+namespace mxnet {
+namespace op {
+
+NNVM_REGISTER_OP(_npi_min)
+    .add_alias("_npi_amin")
+    .describe(R"code()code" ADD_FILELINE)
+    .set_num_inputs(1)
+    .set_num_outputs(1)
+    .set_attr_parser(ParamParser<NumpyReduceAxesNoDTypeParam>)
+    .set_attr<mxnet::FInferShape>("FInferShape", NumpyReduceAxesNoDTypeShape)
+    .set_attr<nnvm::FInferType>("FInferType", NumpyReduceAxesNoDTypeType)
+    .set_attr<nnvm::FListInputNames>("FListInputNames",
+                                     [](const NodeAttrs& attrs) {
+                                       return std::vector<std::string>{"a"};
+                                     })
+    .add_argument("a", "NDArray-or-Symbol", "The input")
+    .add_arguments(NumpyReduceAxesNoDTypeParam::__FIELDS__())
+    .set_attr<FCompute>("FCompute", NumpyReduceAxesNoDTypeCompute<cpu, mshadow::red::minimum>)
+    .set_attr<FResourceRequest>("FResourceRequest",
+                                [](const NodeAttrs& attrs) {
+                                  return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+                                })
+    .set_attr<THasDeterministicOutput>("THasDeterministicOutput", true)
+    .set_attr<nnvm::FGradient>("FGradient", ReduceGrad{"_backward_npi_min"});
+
+NNVM_REGISTER_OP(_backward_npi_min)
+    .set_num_outputs(1)
+    .set_attr_parser(ParamParser<NumpyReduceAxesNoDTypeParam>)
+    .set_attr<nnvm::TIsBackward>("TIsBackward", true)
+    .set_num_inputs(3)
+    .set_attr<FCompute>("FCompute", NumpyReduceAxesNoDTypeBackward<cpu, mshadow_op::eq>);
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/numpy/np_broadcast_reduce_op_value_min.cu b/src/operator/numpy/np_broadcast_reduce_op_value_min.cu
new file mode 100644
index 000000000000..15bdab73cc2f
--- /dev/null
+++ b/src/operator/numpy/np_broadcast_reduce_op_value_min.cu
@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2019 by Contributors
+ * \file np_broadcast_reduce_op_value_min.cu
+ * \brief GPU Implementation of reduce functions based on value.
+ */
+#include "np_broadcast_reduce_op.h"
+
+namespace mxnet {
+namespace op {
+
+NNVM_REGISTER_OP(_npi_min).set_attr<FCompute>(
+    "FCompute",
+    ReduceAxesRTCCompute<NumpyReduceAxesNoDTypeParam, 0>{"identity", "red::minimum{}", false});
+
+NNVM_REGISTER_OP(_backward_npi_min)
+    .set_attr<FCompute>("FCompute", NumpyReduceAxesNoDTypeBackward<gpu, mshadow_op::eq>);
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/numpy/np_broadcast_reduce_op_value_prod.cc b/src/operator/numpy/np_broadcast_reduce_op_value_prod.cc
new file mode 100644
index 000000000000..9bc6f832ee17
--- /dev/null
+++ b/src/operator/numpy/np_broadcast_reduce_op_value_prod.cc
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2019 by Contributors
+ * \file np_broadcast_reduce_op_value_prod.cc
+ * \brief CPU Implementation of broadcast and reduce functions based on value.
+ */
+
+#if MXNET_USE_TVM_OP
+#include "../tvmop/op_module.h"
+#endif  // MXNET_USE_TVM_OP
+
+#include "np_broadcast_reduce_op_value.h"
+
+namespace mxnet {
+namespace op {
+
+NNVM_REGISTER_OP(_npi_prod)
+    .add_alias("_np_product")
+    .set_num_inputs(1)
+    .set_num_outputs(1)
+    .set_attr_parser(ParamParser<NumpyReduceAxesParam>)
+    .set_attr<mxnet::FInferShape>("FInferShape", NumpyReduceAxesShape)
+    .set_attr<nnvm::FInferType>("FInferType", NumpySumType)
+    .add_arguments(NumpyReduceAxesParam::__FIELDS__())
+    .set_attr<nnvm::FListInputNames>("FListInputNames",
+                                     [](const NodeAttrs& attrs) {
+                                       return std::vector<std::string>{"a"};
+                                     })
+    .add_argument("a", "NDArray-or-Symbol", "The input")
+    .set_attr<FCompute>("FCompute", NumpyReduceAxesCompute<cpu, mshadow_op::product, false>)
+    .set_attr<FResourceRequest>("FResourceRequest",
+                                [](const NodeAttrs& attrs) {
+                                  return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+                                })
+    .set_attr<THasDeterministicOutput>("THasDeterministicOutput", true)
+    .set_attr<nnvm::FGradient>("FGradient", ReduceGrad{"_backward_npi_prod"});
+
+NNVM_REGISTER_OP(_backward_npi_prod)
+    .set_num_inputs(3)
+    .set_num_outputs(1)
+    .set_attr_parser(ParamParser<NumpyReduceAxesParam>)
+    .set_attr<nnvm::TIsBackward>("TIsBackward", true)
+    .set_attr<FCompute>("FCompute", NumpyReduceAxesBackwardUseInOut<cpu, mshadow_op::rdiv>);
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/numpy/np_broadcast_reduce_op_value_prod.cu b/src/operator/numpy/np_broadcast_reduce_op_value_prod.cu
new file mode 100644
index 000000000000..1686fd3780f9
--- /dev/null
+++ b/src/operator/numpy/np_broadcast_reduce_op_value_prod.cu
@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2019 by Contributors
+ * \file np_broadcast_reduce_op_value_prod.cu
+ * \brief GPU Implementation of reduce functions based on value.
+ */
+#include "np_broadcast_reduce_op.h"
+
+namespace mxnet {
+namespace op {
+
+NNVM_REGISTER_OP(_npi_prod).set_attr<FCompute>(
+    "FCompute",
+    ReduceAxesRTCCompute<NumpyReduceAxesParam, 1>{"identity", "red::product{}", false});
+
+NNVM_REGISTER_OP(_backward_npi_prod)
+    .set_attr<FCompute>("FCompute", NumpyReduceAxesBackwardUseInOut<gpu, mshadow_op::rdiv>);
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/numpy/np_broadcast_reduce_op_value_sum.cc b/src/operator/numpy/np_broadcast_reduce_op_value_sum.cc
new file mode 100644
index 000000000000..084a4b8acfec
--- /dev/null
+++ b/src/operator/numpy/np_broadcast_reduce_op_value_sum.cc
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2019 by Contributors
+ * \file np_broadcast_reduce_op_value_sum.cc
+ * \brief CPU Implementation of broadcast and reduce functions based on value.
+ */
+
+#if MXNET_USE_TVM_OP
+#include "../tvmop/op_module.h"
+#endif  // MXNET_USE_TVM_OP
+
+#include "np_broadcast_reduce_op_value.h"
+
+namespace mxnet {
+namespace op {
+
+DMLC_REGISTER_PARAMETER(NumpyReduceAxesParam);
+DMLC_REGISTER_PARAMETER(NumpyReduceAxesNoDTypeParam);
+
+NNVM_REGISTER_OP(_npi_sum)
+    .describe(R"code()code" ADD_FILELINE)
+    .set_num_inputs(1)
+    .set_num_outputs(1)
+    .set_attr_parser(ParamParser<NumpyReduceAxesParam>)
+    .set_attr<mxnet::FInferShape>("FInferShape", NumpyReduceAxesShape)
+    .set_attr<nnvm::FInferType>("FInferType", NumpySumType)
+    .set_attr<nnvm::FListInputNames>("FListInputNames",
+                                     [](const NodeAttrs& attrs) {
+                                       return std::vector<std::string>{"a"};
+                                     })
+    .add_argument("a", "NDArray-or-Symbol", "The input")
+    .add_arguments(NumpyReduceAxesParam::__FIELDS__())
+    .set_attr<FCompute>("FCompute", NumpyReduceAxesCompute<cpu, mshadow_op::sum, true>)
+    .set_attr<FResourceRequest>("FResourceRequest",
+                                [](const NodeAttrs& attrs) {
+                                  return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+                                })
+    .set_attr<THasDeterministicOutput>("THasDeterministicOutput", true)
+    .set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseNone{"_backward_npi_sum"});
+
+NNVM_REGISTER_OP(_backward_npi_sum)
+    .set_num_outputs(1)
+    .set_attr_parser(ParamParser<NumpyReduceAxesParam>)
+    .set_attr<nnvm::TIsBackward>("TIsBackward", true)
+    .set_num_inputs(1)
+    .set_attr<FCompute>("FCompute", NumpyReduceAxesBackwardUseNone<cpu>);
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/numpy/np_broadcast_reduce_op_value_sum.cu b/src/operator/numpy/np_broadcast_reduce_op_value_sum.cu
new file mode 100644
index 000000000000..a53f59763543
--- /dev/null
+++ b/src/operator/numpy/np_broadcast_reduce_op_value_sum.cu
@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2019 by Contributors
+ * \file np_broadcast_reduce_op_value_sum.cu
+ * \brief GPU Implementation of reduce functions based on value.
+ */
+#include "np_broadcast_reduce_op.h"
+
+namespace mxnet {
+namespace op {
+
+NNVM_REGISTER_OP(_npi_sum).set_attr<FCompute>(
+    "FCompute",
+    ReduceAxesRTCCompute<NumpyReduceAxesParam, 0>{"identity", "red::sum{}", false});
+
+NNVM_REGISTER_OP(_backward_npi_sum)
+    .set_attr<FCompute>("FCompute", NumpyReduceAxesBackwardUseNone<gpu>);
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/numpy/np_dot_backward.cc b/src/operator/numpy/np_dot_backward.cc
new file mode 100644
index 000000000000..976f1de7f778
--- /dev/null
+++ b/src/operator/numpy/np_dot_backward.cc
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file np_dot_backward.cc
+ * \brief CPU Implementation of numpy-compatible dot
+ */
+
+#include "./np_dot-inl.h"
+
+namespace mxnet {
+namespace op {
+
+NNVM_REGISTER_OP(_backward_npi_dot)
+    .set_num_inputs(3)
+    .set_num_outputs(2)
+    .set_attr<nnvm::TIsBackward>("TIsBackward", true)
+    .set_attr<FResourceRequest>("FResourceRequest",
+                                [](const NodeAttrs& attrs) {
+                                  return std::vector<ResourceRequest>(1,
+                                                                      ResourceRequest::kTempSpace);
+                                })
+    .set_attr<FCompute>("FCompute", NumpyDotBackward<cpu>);
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/numpy/np_dot.cu b/src/operator/numpy/np_dot_backward.cu
similarity index 90%
rename from src/operator/numpy/np_dot.cu
rename to src/operator/numpy/np_dot_backward.cu
index 290bdd31ec90..3b3580956993 100644
--- a/src/operator/numpy/np_dot.cu
+++ b/src/operator/numpy/np_dot_backward.cu
@@ -18,7 +18,7 @@
  */
 
 /*!
- * \file np_dot.cu
+ * \file np_dot_backward.cu
  * \brief GPU Implementation of numpy-compatible dot
  */
 
@@ -27,8 +27,6 @@
 namespace mxnet {
 namespace op {
 
-NNVM_REGISTER_OP(_npi_dot).set_attr<FCompute>("FCompute", NumpyDotForward<gpu>);
-
 NNVM_REGISTER_OP(_backward_npi_dot).set_attr<FCompute>("FCompute", NumpyDotBackward<gpu>);
 
 }  // namespace op
diff --git a/src/operator/numpy/np_dot.cc b/src/operator/numpy/np_dot_forward.cc
similarity index 91%
rename from src/operator/numpy/np_dot.cc
rename to src/operator/numpy/np_dot_forward.cc
index 66daf4ba6981..1c2da2d471a7 100644
--- a/src/operator/numpy/np_dot.cc
+++ b/src/operator/numpy/np_dot_forward.cc
@@ -18,7 +18,7 @@
  */
 
 /*!
- * \file np_dot.cc
+ * \file np_dot_forward.cc
  * \brief CPU Implementation of numpy-compatible dot
  */
 
@@ -138,16 +138,5 @@ NNVM_REGISTER_OP(_npi_dot)
     .add_argument("a", "NDArray-or-Symbol", "First input")
     .add_argument("b", "NDArray-or-Symbol", "Second input");
 
-NNVM_REGISTER_OP(_backward_npi_dot)
-    .set_num_inputs(3)
-    .set_num_outputs(2)
-    .set_attr<nnvm::TIsBackward>("TIsBackward", true)
-    .set_attr<FResourceRequest>("FResourceRequest",
-                                [](const NodeAttrs& attrs) {
-                                  return std::vector<ResourceRequest>(1,
-                                                                      ResourceRequest::kTempSpace);
-                                })
-    .set_attr<FCompute>("FCompute", NumpyDotBackward<cpu>);
-
 }  // namespace op
 }  // namespace mxnet
diff --git a/src/operator/numpy/np_dot_forward.cu b/src/operator/numpy/np_dot_forward.cu
new file mode 100644
index 000000000000..0986f40c5fea
--- /dev/null
+++ b/src/operator/numpy/np_dot_forward.cu
@@ -0,0 +1,33 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file np_dot_forward.cu
+ * \brief GPU Implementation of numpy-compatible dot
+ */
+
+#include "./np_dot-inl.h"
+
+namespace mxnet {
+namespace op {
+
+NNVM_REGISTER_OP(_npi_dot).set_attr<FCompute>("FCompute", NumpyDotForward<gpu>);
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/numpy/np_kron_backward.cc b/src/operator/numpy/np_kron_backward.cc
new file mode 100644
index 000000000000..cda460626dc3
--- /dev/null
+++ b/src/operator/numpy/np_kron_backward.cc
@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file np_kron_backward.cc
+ * \brief CPU Implementation of numpy-compatible Kronecker product
+ */
+
+#include "./np_kron-inl.h"
+
+namespace mxnet {
+namespace op {
+
+NNVM_REGISTER_OP(_backward_npi_kron)
+    .set_num_inputs(3)
+    .set_num_outputs(2)
+    .set_attr<nnvm::TIsBackward>("TIsBackward", true)
+    .set_attr<FResourceRequest>("FResourceRequest",
+                                [](const NodeAttrs& attrs) {
+                                  return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+                                })
+    .set_attr<FCompute>("FCompute", KronOpBackward<cpu>);
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/numpy/np_kron.cu b/src/operator/numpy/np_kron_backward.cu
similarity index 91%
rename from src/operator/numpy/np_kron.cu
rename to src/operator/numpy/np_kron_backward.cu
index 4ec8b1a43cd5..2f3e9abd5cca 100644
--- a/src/operator/numpy/np_kron.cu
+++ b/src/operator/numpy/np_kron_backward.cu
@@ -18,7 +18,7 @@
  */
 
 /*!
- * \file np_kron.cu
+ * \file np_kron_backward.cu
 * \brief GPU Implementation of numpy-compatible Kronecker product
  */
 
@@ -27,8 +27,6 @@
 namespace mxnet {
 namespace op {
 
-NNVM_REGISTER_OP(_npi_kron).set_attr<FCompute>("FCompute", KronOpForward<gpu>);
-
 NNVM_REGISTER_OP(_backward_npi_kron).set_attr<FCompute>("FCompute", KronOpBackward<gpu>);
 
 }  // namespace op
diff --git a/src/operator/numpy/np_kron.cc b/src/operator/numpy/np_kron_forward.cc
similarity index 87%
rename from src/operator/numpy/np_kron.cc
rename to src/operator/numpy/np_kron_forward.cc
index 19cf049ad083..240586a9c768 100644
--- a/src/operator/numpy/np_kron.cc
+++ b/src/operator/numpy/np_kron_forward.cc
@@ -18,7 +18,7 @@
  */
 
 /*!
- * \file np_kron.cc
+ * \file np_kron_forward.cc
  * \brief CPU Implementation of numpy-compatible Kronecker product
  */
 
@@ -80,15 +80,5 @@ NNVM_REGISTER_OP(_npi_kron)
     .add_argument("a", "NDArray-or-Symbol", "First input")
     .add_argument("b", "NDArray-or-Symbol", "Second input");
 
-NNVM_REGISTER_OP(_backward_npi_kron)
-    .set_num_inputs(3)
-    .set_num_outputs(2)
-    .set_attr<nnvm::TIsBackward>("TIsBackward", true)
-    .set_attr<FResourceRequest>("FResourceRequest",
-                                [](const NodeAttrs& attrs) {
-                                  return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
-                                })
-    .set_attr<FCompute>("FCompute", KronOpBackward<cpu>);
-
 }  // namespace op
 }  // namespace mxnet
diff --git a/src/operator/numpy/np_kron_forward.cu b/src/operator/numpy/np_kron_forward.cu
new file mode 100644
index 000000000000..ee7fdd629a08
--- /dev/null
+++ b/src/operator/numpy/np_kron_forward.cu
@@ -0,0 +1,33 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
diff --git a/src/operator/numpy/np_kron_forward.cu b/src/operator/numpy/np_kron_forward.cu
new file mode 100644
index 000000000000..ee7fdd629a08
--- /dev/null
+++ b/src/operator/numpy/np_kron_forward.cu
@@ -0,0 +1,33 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file np_kron_forward.cu
+ * \brief GPU Implementation of numpy-compatible Kronecker product
+ */
+
+#include "./np_kron-inl.h"
+
+namespace mxnet {
+namespace op {
+
+NNVM_REGISTER_OP(_npi_kron).set_attr<FCompute>("FCompute", KronOpForward<gpu>);
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/numpy/np_moments_op.cu b/src/operator/numpy/np_moments_op.cu
new file mode 100644
index 000000000000..16d8d6234187
--- /dev/null
+++ b/src/operator/numpy/np_moments_op.cu
@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2019 by Contributors
+ * \file np_moments_op.cu
+ * \brief GPU Implementation of reduce functions based on value.
+ */
+#include "np_broadcast_reduce_op.h"
+
+namespace mxnet {
+namespace op {
+
+NNVM_REGISTER_OP(_npi_average)
+    .set_attr<FCompute>("FCompute", NumpyWeightedAverageForward<gpu>);
+
+NNVM_REGISTER_OP(_backward_np_average)
+    .set_attr<FCompute>("FCompute", NumpyWeightedAverageBackward<gpu>);
+
+NNVM_REGISTER_OP(_npi_std).set_attr<FCompute>("FCompute", NumpyMomentsForward<gpu, true>);
+
+NNVM_REGISTER_OP(_npi_var).set_attr<FCompute>("FCompute", NumpyMomentsForward<gpu, false>);
+
+}  // namespace op
+}  // namespace mxnet
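_npi_std and _npi_var share one NumpyMomentsForward kernel; the boolean second template argument shown on those registrations follows the usual MXNet convention (true selects std, i.e. the square root of var) and should be treated as an assumption to check against np_broadcast_reduce_op.h. The pattern in miniature:

    // Minimal sketch of the std/var sharing pattern; sqrt_flag mirrors the
    // assumed second template parameter of NumpyMomentsForward. n > 0 assumed.
    #include <cmath>
    #include <cstddef>

    template <bool sqrt_flag>
    double MomentsForward(const double* data, std::size_t n) {
      double mean = 0.0, m2 = 0.0;
      for (std::size_t i = 0; i < n; ++i) mean += data[i];
      mean /= n;
      for (std::size_t i = 0; i < n; ++i) m2 += (data[i] - mean) * (data[i] - mean);
      const double var = m2 / n;                      // numpy default: ddof = 0
      return sqrt_flag ? std::sqrt(var) : var;        // std when true, var when false
    }

    // usage: MomentsForward<true>(x, n) -> std; MomentsForward<false>(x, n) -> var

Resolving the flag at compile time means the branch costs nothing per element, which is why one template serves both ops instead of a runtime parameter.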
diff --git a/src/operator/numpy/np_where_backward_op.cc b/src/operator/numpy/np_where_backward_op.cc
new file mode 100644
index 000000000000..d55680f1a169
--- /dev/null
+++ b/src/operator/numpy/np_where_backward_op.cc
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2017 by Contributors
+ * \file np_where_backward_op.cc
+ * \brief CPU Implementation of numpy operator where
+ */
+
+#include "np_where_op-inl.h"
+
+namespace mxnet {
+namespace op {
+
+NNVM_REGISTER_OP(_backward_np_where)
+    .set_num_inputs(2)
+    .set_num_outputs(2)
+    .set_attr<nnvm::TIsBackward>("TIsBackward", true)
+    .set_attr<FCompute>("FCompute", NumpyWhereOpBackward<cpu>)
+    .set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& attrs) {
+      return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+    });
+
+NNVM_REGISTER_OP(_backward_np_where_lscalar)
+    .set_num_inputs(2)
+    .set_num_outputs(1)
+    .set_attr<nnvm::TIsBackward>("TIsBackward", true)
+    .set_attr<FCompute>("FCompute", NumpyWhereScalarOpBackward<cpu, true>)
+    .set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& attrs) {
+      return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+    });
+
+NNVM_REGISTER_OP(_backward_np_where_rscalar)
+    .set_num_inputs(2)
+    .set_num_outputs(1)
+    .set_attr<nnvm::TIsBackward>("TIsBackward", true)
+    .set_attr<FCompute>("FCompute", NumpyWhereScalarOpBackward<cpu, false>)
+    .set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& attrs) {
+      return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+    });
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/numpy/np_where_backward_op.cu b/src/operator/numpy/np_where_backward_op.cu
new file mode 100644
index 000000000000..612c93090386
--- /dev/null
+++ b/src/operator/numpy/np_where_backward_op.cu
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2017 by Contributors
+ * \file np_where_backward_op.cu
+ * \brief GPU Implementation of numpy operator where
+ */
+
+#include "np_where_op-inl.h"
+
+namespace mxnet {
+namespace op {
+
+NNVM_REGISTER_OP(_backward_np_where).set_attr<FCompute>("FCompute", NumpyWhereOpBackward<gpu>);
+
+NNVM_REGISTER_OP(_backward_np_where_lscalar)
+    .set_attr<FCompute>("FCompute", NumpyWhereScalarOpBackward<gpu, true>);
+
+NNVM_REGISTER_OP(_backward_np_where_rscalar)
+    .set_attr<FCompute>("FCompute", NumpyWhereScalarOpBackward<gpu, false>);
+
+}  // namespace op
+}  // namespace mxnet
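These registrations encode the gradient rule of where: with out = where(cond, x, y), the incoming gradient routes to x where cond holds and to y elsewhere, which is why _backward_np_where has two outputs while the scalar variants, whose scalar operand needs no gradient, have one. A scalar-free sketch of the rule (broadcast reduction, which is what the kTempSpace is for, omitted):

    // Sketch of the where gradient: cond itself receives no gradient.
    #include <cstddef>
    #include <vector>

    void WhereBackward(const std::vector<bool>& cond,
                       const std::vector<double>& ograd,
                       std::vector<double>* dx,
                       std::vector<double>* dy) {
      dx->assign(ograd.size(), 0.0);
      dy->assign(ograd.size(), 0.0);
      for (std::size_t i = 0; i < ograd.size(); ++i) {
        (cond[i] ? (*dx)[i] : (*dy)[i]) = ograd[i];  // route grad by condition
      }
    }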
diff --git a/src/operator/numpy/np_where_op.cc b/src/operator/numpy/np_where_forward_op.cc
similarity index 90%
rename from src/operator/numpy/np_where_op.cc
rename to src/operator/numpy/np_where_forward_op.cc
index 54dcb89e9efa..f999fecb8332 100644
--- a/src/operator/numpy/np_where_op.cc
+++ b/src/operator/numpy/np_where_forward_op.cc
@@ -19,12 +19,11 @@
 
 /*!
  * Copyright (c) 2017 by Contributors
- * \file np_where_op.cc
+ * \file np_where_forward_op.cc
  * \brief CPU Implementation of numpy operator where
  */
 
 #include "np_where_op-inl.h"
-#include "../tensor/elemwise_binary_broadcast_op.h"
 
 namespace mxnet {
 namespace op {
@@ -137,15 +136,6 @@ NNVM_REGISTER_OP(_npi_where)
     .add_argument("x", "NDArray-or-Symbol", "input x")
     .add_argument("y", "NDArray-or-Symbol", "input y");
 
-NNVM_REGISTER_OP(_backward_np_where)
-    .set_num_inputs(2)
-    .set_num_outputs(2)
-    .set_attr<nnvm::TIsBackward>("TIsBackward", true)
-    .set_attr<FCompute>("FCompute", NumpyWhereOpBackward<cpu>)
-    .set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& attrs) {
-      return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
-    });
-
 NNVM_REGISTER_OP(_npi_where_lscalar)
     .set_num_inputs(2)
     .set_num_outputs(1)
@@ -238,24 +228,6 @@ NNVM_REGISTER_OP(_npi_where_rscalar)
     .add_argument("y", "NDArray-or-Symbol", "input y")
     .add_arguments(NumpyWhereScalarParam::__FIELDS__());
 
-NNVM_REGISTER_OP(_backward_np_where_lscalar)
-    .set_num_inputs(2)
-    .set_num_outputs(1)
-    .set_attr<nnvm::TIsBackward>("TIsBackward", true)
-    .set_attr<FCompute>("FCompute", NumpyWhereScalarOpBackward<cpu, true>)
-    .set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& attrs) {
-      return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
-    });
-
-NNVM_REGISTER_OP(_backward_np_where_rscalar)
-    .set_num_inputs(2)
-    .set_num_outputs(1)
-    .set_attr<nnvm::TIsBackward>("TIsBackward", true)
-    .set_attr<FCompute>("FCompute", NumpyWhereScalarOpBackward<cpu, false>)
-    .set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& attrs) {
-      return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
-    });
-
 NNVM_REGISTER_OP(_npi_where_scalar2)
     .set_num_inputs(1)
     .set_num_outputs(1)
diff --git a/src/operator/numpy/np_where_op.cu b/src/operator/numpy/np_where_forward_op.cu
similarity index 79%
rename from src/operator/numpy/np_where_op.cu
rename to src/operator/numpy/np_where_forward_op.cu
index a1477c416f12..acd7edc1f4f8 100644
--- a/src/operator/numpy/np_where_op.cu
+++ b/src/operator/numpy/np_where_forward_op.cu
@@ -19,7 +19,7 @@
 
 /*!
  * Copyright (c) 2017 by Contributors
- * \file np_where_op.cu
+ * \file np_where_forward_op.cu
  * \brief GPU Implementation of numpy operator where
  */
 
@@ -30,20 +30,12 @@ namespace op {
 
 NNVM_REGISTER_OP(_npi_where).set_attr<FCompute>("FCompute", NumpyWhereOpForward<gpu>);
 
-NNVM_REGISTER_OP(_backward_np_where).set_attr<FCompute>("FCompute", NumpyWhereOpBackward<gpu>);
-
 NNVM_REGISTER_OP(_npi_where_lscalar)
     .set_attr<FCompute>("FCompute", NumpyWhereScalarOpForward<gpu, true>);
 
 NNVM_REGISTER_OP(_npi_where_rscalar)
     .set_attr<FCompute>("FCompute", NumpyWhereScalarOpForward<gpu, false>);
 
-NNVM_REGISTER_OP(_backward_np_where_lscalar)
-    .set_attr<FCompute>("FCompute", NumpyWhereScalarOpBackward<gpu, true>);
-
-NNVM_REGISTER_OP(_backward_np_where_rscalar)
-    .set_attr<FCompute>("FCompute", NumpyWhereScalarOpBackward<gpu, false>);
-
NNVM_REGISTER_OP(_npi_where_scalar2)
    .set_attr<FCompute>("FCompute", NumpyWhereScalar2OpForward<gpu>);
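The lscalar/rscalar pairs differ only in which side of the where is the scalar; the boolean template argument on their registrations is assumed from the usual np_where_op-inl.h convention (true = scalar on the left), so read it as a convention-based reconstruction rather than confirmed source. The compile-time switch in miniature:

    // Sketch of the assumed is_left switch behind NumpyWhereScalarOpForward;
    // the real kernel works on TBlobs with broadcasting, omitted here.
    #include <cstddef>
    #include <vector>

    template <bool is_left>  // true: scalar plays x; false: scalar plays y
    std::vector<double> WhereScalarForward(const std::vector<bool>& cond,
                                           double scalar,
                                           const std::vector<double>& tensor) {
      std::vector<double> out(cond.size());
      for (std::size_t i = 0; i < cond.size(); ++i) {
        const double x = is_left ? scalar : tensor[i];
        const double y = is_left ? tensor[i] : scalar;
        out[i] = cond[i] ? x : y;
      }
      return out;
    }

As with std/var above, one templated kernel instantiated twice avoids a per-element runtime branch and keeps the two operator registrations trivially small.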