diff --git a/src/operator/tensor/broadcast_reduce_minmax_value.cc b/src/operator/tensor/broadcast_reduce_minmax_value.cc
new file mode 100644
index 000000000000..f8bc33ba375d
--- /dev/null
+++ b/src/operator/tensor/broadcast_reduce_minmax_value.cc
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ *  Copyright (c) 2016 by Contributors
+ * \file broadcast_reduce_minmax_value.cc
+ * \brief CPU Implementation of broadcast and reduce min and max functions based on value.
+ */
+#include "./broadcast_reduce_op.h"
+
+namespace mxnet {
+namespace op {
+
+MXNET_OPERATOR_REGISTER_MINMAX_REDUCE(max)  // CPU-side registration of the `max` reduce operator
+.add_alias("max_axis")  // the same operator is also reachable as `max_axis`
+.describe(get_reduce_axes_description("max", __LINE__))  // docstring built from op name + line
+.set_attr<FCompute>("FCompute<cpu>", ReduceAxesCompute<cpu, mshadow::red::maximum>)
+.set_attr<FResourceRequest>("FResourceRequest",  // requests a single temp-space resource
+  [](const NodeAttrs& attrs) {
+    return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+  })
+.set_attr<nnvm::FGradient>("FGradient", ReduceGrad{"_backward_max"});
+
+MXNET_OPERATOR_REGISTER_REDUCE_BACKWARD(_backward_max)  // op named by max's FGradient above
+.set_num_inputs(3)  // NOTE(review): presumably out-grad, forward input, forward output - confirm
+.set_attr<FCompute>("FCompute<cpu>", ReduceAxesBackwardUseInOut<cpu, mshadow_op::eq>);
+
+MXNET_OPERATOR_REGISTER_MINMAX_REDUCE(min)  // CPU-side registration of the `min` reduce operator
+.add_alias("min_axis")  // the same operator is also reachable as `min_axis`
+.describe(get_reduce_axes_description("min", __LINE__))  // docstring built from op name + line
+.set_attr<FCompute>("FCompute<cpu>", ReduceAxesCompute<cpu, mshadow::red::minimum>)
+.set_attr<FResourceRequest>("FResourceRequest",  // requests a single temp-space resource
+  [](const NodeAttrs& attrs) {
+    return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+  })
+.set_attr<nnvm::FGradient>("FGradient", ReduceGrad{"_backward_min"});
+
+MXNET_OPERATOR_REGISTER_REDUCE_BACKWARD(_backward_min)  // op named by min's FGradient above
+.set_num_inputs(3)  // NOTE(review): presumably out-grad, forward input, forward output - confirm
+.set_attr<FCompute>("FCompute<cpu>", ReduceAxesBackwardUseInOut<cpu, mshadow_op::eq>);
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/tensor/broadcast_reduce_minmax_value.cu b/src/operator/tensor/broadcast_reduce_minmax_value.cu
new file mode 100644
index 000000000000..baf79feb5c60
--- /dev/null
+++ b/src/operator/tensor/broadcast_reduce_minmax_value.cu
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ *  Copyright (c) 2016 by Contributors
+ * \file broadcast_reduce_minmax_value.cu
+ * \brief GPU Implementation of broadcast and reduce min and max functions based on value.
+ */
+#include "./broadcast_reduce_op.h"
+
+namespace mxnet {
+namespace op {
+
+NNVM_REGISTER_OP(max)  // adds the GPU FCompute to `max`; its other attributes are set in the .cc
+.set_attr<FCompute>("FCompute<gpu>", ReduceAxesCompute<gpu, mshadow::red::maximum>);
+
+NNVM_REGISTER_OP(_backward_max)  // GPU FCompute for the backward op of `max`
+.set_attr<FCompute>("FCompute<gpu>", ReduceAxesBackwardUseInOut<gpu, mshadow_op::eq>);
+
+NNVM_REGISTER_OP(min)  // adds the GPU FCompute to `min`; its other attributes are set in the .cc
+.set_attr<FCompute>("FCompute<gpu>", ReduceAxesCompute<gpu, mshadow::red::minimum>);
+
+NNVM_REGISTER_OP(_backward_min)  // GPU FCompute for the backward op of `min`
+.set_attr<FCompute>("FCompute<gpu>", ReduceAxesBackwardUseInOut<gpu, mshadow_op::eq>);
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/tensor/broadcast_reduce_norm_value.cc b/src/operator/tensor/broadcast_reduce_norm_value.cc
new file mode 100644
index 000000000000..63a05b4980fc
--- /dev/null
+++ b/src/operator/tensor/broadcast_reduce_norm_value.cc
@@ -0,0 +1,118 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ *  Copyright (c) 2016 by Contributors
+ * \file broadcast_reduce_norm_value.cc
+ * \brief CPU Implementation of broadcast and reduce norm functions based on value.
+ */
+#include "./broadcast_reduce_op.h"
+
+namespace mxnet {
+namespace op {
+DMLC_REGISTER_PARAMETER(NormParam);  // registers NormParam; presumably once per binary - confirm
+
+template<>
+void L2NormComputeEx<cpu>(const nnvm::NodeAttrs& attrs,
+                          const OpContext& ctx,
+                          const std::vector<NDArray>& inputs,
+                          const std::vector<OpReqType>& req,
+                          const std::vector<NDArray>& outputs) {
+  CHECK_EQ(inputs.size(), 1U);
+  CHECK_EQ(outputs.size(), 1U);
+  CHECK_EQ(req.size(), 1U);
+  const NormParam& opts = nnvm::get<NormParam>(attrs.parsed);
+  mshadow::Stream<cpu>* stream = ctx.get_stream<cpu>();
+  const NDArrayStorageType stype = inputs[0].storage_type();
+  const mxnet::TShape axes = opts.axis.has_value() ? opts.axis.value() : mxnet::TShape();
+  const bool over_whole_array = opts.ord == 2 && axes.ndim() == 0 &&
+                                (stype == kCSRStorage || stype == kRowSparseStorage);
+  const bool over_one_csr_axis = opts.ord == 2 && stype == kCSRStorage && !opts.keepdims &&
+                                 axes.ndim() == 1 && (axes[0] == 0 || axes[0] == 1);
+  if (over_whole_array) {  // row_sparse or csr input, no axis: L2 norm of the entire array
+    L2NormComputeSparseImpl<cpu>(stream, inputs[0], req[0], outputs[0].data());
+  } else if (over_one_csr_axis) {  // csr, axis 0 or 1, keepdims off: sq_sum, then SqRootForL2
+    NDArray reduced = outputs[0];
+    ReduceCsrImpl<cpu, sq_sum, false>(stream, ctx, inputs[0], req[0], &reduced, axes);
+    CHECK_EQ(outputs[0].storage_type(), kDefaultStorage);
+    SqRootForL2<cpu>(ctx, req[0], outputs[0].data());
+  } else {  // every other storage/axis/ord combination is reported as unimplemented
+    LogUnimplementedOp(attrs, ctx, inputs, req, outputs);
+  }
+}
+
+NNVM_REGISTER_OP(norm)  // operator `norm`: CPU kernels plus shape/type/storage inference
+MXNET_ADD_SPARSE_OP_ALIAS(norm)  // NOTE(review): presumably a sparse-namespace alias - confirm
+.describe(R"code(Computes the norm on an NDArray.
+
+This operator computes the norm on an NDArray with the specified axis, depending
+on the value of the ord parameter. By default, it computes the L2 norm on the entire
+array. Currently only ord=2 supports sparse ndarrays.
+
+Examples::
+
+  x = [[[1, 2],
+        [3, 4]],
+       [[2, 2],
+        [5, 6]]]
+
+  norm(x, ord=2, axis=1) = [[3.1622777 4.472136 ]
+                            [5.3851647 6.3245554]]
+
+  norm(x, ord=1, axis=1) = [[4., 6.],
+                            [7., 8.]]
+
+  rsp = x.cast_storage('row_sparse')
+
+  norm(rsp) = [5.47722578]
+
+  csr = x.cast_storage('csr')
+
+  norm(csr) = [5.47722578]
+
+)code" ADD_FILELINE)  // NOTE(review): 5.477 is sqrt(30), not the L2 norm of the 3-D x above
+.set_num_inputs(1)
+.set_num_outputs(1)
+.set_attr_parser(ParamParser<NormParam>)
+.set_attr<mxnet::FInferShape>("FInferShape", NormShape)
+.set_attr<nnvm::FInferType>("FInferType", NormType)
+.set_attr<FInferStorageType>("FInferStorageType", LpNormStorageType)
+.set_attr<nnvm::FGradient>("FGradient", ReduceGrad{ "_backward_norm" })
+.set_attr<FResourceRequest>("FResourceRequest",  // requests a single temp-space resource
+  [](const NodeAttrs& attrs) {
+    return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+  })
+.set_attr<FCompute>("FCompute<cpu>", LpNormCompute<cpu>)
+.set_attr<FComputeEx>("FComputeEx<cpu>", L2NormComputeEx<cpu>)  // row_sparse / csr inputs
+.add_argument("data", "NDArray-or-Symbol", "The input")
+.add_arguments(NormParam::__FIELDS__());
+
+NNVM_REGISTER_OP(_backward_norm)  // backward op named by the FGradient of `norm`
+.set_num_outputs(1)
+.set_attr_parser(ParamParser<NormParam>)  // parsed with the same NormParam as the forward op
+.set_attr<nnvm::TIsBackward>("TIsBackward", true)
+.set_attr<FResourceRequest>("FResourceRequest",  // requests a single temp-space resource
+  [](const NodeAttrs& attrs) {
+    return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+  })
+.set_attr<FCompute>("FCompute<cpu>", LpNormGradCompute<cpu>);
+
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/tensor/broadcast_reduce_norm_value.cu b/src/operator/tensor/broadcast_reduce_norm_value.cu
new file mode 100644
index 000000000000..188c93e61221
--- /dev/null
+++ b/src/operator/tensor/broadcast_reduce_norm_value.cu
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ *  Copyright (c) 2016 by Contributors
+ * \file broadcast_reduce_norm_value.cu
+ * \brief GPU Implementation of broadcast and reduce norm functions based on value.
+ */
+#include "./broadcast_reduce_op.h"
+
+namespace mxnet {
+namespace op {
+
+template<>
+void L2NormComputeEx<gpu>(const nnvm::NodeAttrs& attrs,
+                          const OpContext& ctx,
+                          const std::vector<NDArray>& inputs,
+                          const std::vector<OpReqType>& req,
+                          const std::vector<NDArray>& outputs) {
+  CHECK_EQ(inputs.size(), 1U);
+  CHECK_EQ(outputs.size(), 1U);
+  CHECK_EQ(req.size(), 1U);
+  const NormParam& opts = nnvm::get<NormParam>(attrs.parsed);
+  mshadow::Stream<gpu>* stream = ctx.get_stream<gpu>();
+  const NDArrayStorageType stype = inputs[0].storage_type();
+  const mxnet::TShape axes = opts.axis.has_value() ? opts.axis.value() : mxnet::TShape();
+  const bool sparse_input = stype == kRowSparseStorage || stype == kCSRStorage;
+  if (!(sparse_input && axes.ndim() == 0 && opts.ord == 2)) {
+    // Only the whole-array L2 norm of row_sparse/csr input is handled on this path.
+    LogUnimplementedOp(attrs, ctx, inputs, req, outputs);
+    return;
+  }
+  L2NormComputeSparseImpl<gpu>(stream, inputs[0], req[0], outputs[0].data());
+}
+
+NNVM_REGISTER_OP(norm)  // GPU kernels for `norm`; the remaining attributes are set in the .cc
+.set_attr<FCompute>("FCompute<gpu>", LpNormCompute<gpu>)
+.set_attr<FComputeEx>("FComputeEx<gpu>", L2NormComputeEx<gpu>);  // row_sparse / csr inputs
+
+NNVM_REGISTER_OP(_backward_norm)  // GPU kernel for the backward op of `norm`
+.set_attr<FCompute>("FCompute<gpu>", LpNormGradCompute<gpu>);
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/tensor/broadcast_reduce_op.h b/src/operator/tensor/broadcast_reduce_op.h
index 27edf0195a1c..2e40b87a4b6d 100644
--- a/src/operator/tensor/broadcast_reduce_op.h
+++ b/src/operator/tensor/broadcast_reduce_op.h
@@ -26,6 +26,7 @@
 #define MXNET_OPERATOR_TENSOR_BROADCAST_REDUCE_OP_H_
 
 #include <mxnet/operator_util.h>
+#include <string>
 #include <vector>
 #include <utility>
 #include <algorithm>
@@ -1580,6 +1581,20 @@ void PickOpBackward(const nnvm::NodeAttrs& attrs,
   });
 }
 
+/*! \brief Docstring of a reduce-over-axes operator, tagged with its registration site.
+ *  \param op_name word used in "Computes the <op_name> of array elements over given axes."
+ *  \param file,line registration site, appended as "Defined in <file>:L<line>" */
+inline std::string get_reduce_axes_description_at(const std::string& op_name,
+                                                  const char* file, int line) {
+  return "Computes the " + op_name + " of array elements over given axes.\n\nDefined in " +
+         std::string(file) + ":L" + std::to_string(line);
+}
+
+// Macro with the original call shape (op_name, line): __FILE__ must expand in the .cc that
+// registers the operator. Written inside this header it would always name the header instead.
+#define get_reduce_axes_description(op_name, line) \
+  get_reduce_axes_description_at((op_name), __FILE__, (line))
+
 #define MXNET_OPERATOR_REGISTER_REDUCE_AXIS(name)               \
   NNVM_REGISTER_OP(name)                                        \
   .set_num_inputs(1)                                            \
diff --git a/src/operator/tensor/broadcast_reduce_op_value.cc b/src/operator/tensor/broadcast_reduce_op_value.cc
index 7e8a7f05659b..43bfc729329a 100644
--- a/src/operator/tensor/broadcast_reduce_op_value.cc
+++ b/src/operator/tensor/broadcast_reduce_op_value.cc
@@ -27,193 +27,11 @@
 namespace mxnet {
 namespace op {
 DMLC_REGISTER_PARAMETER(ReduceAxesParam);
-DMLC_REGISTER_PARAMETER(NormParam);
 DMLC_REGISTER_PARAMETER(ReduceAxisParam);
 DMLC_REGISTER_PARAMETER(BroadcastAxesParam);
 DMLC_REGISTER_PARAMETER(BroadcastToParam);
 DMLC_REGISTER_PARAMETER(BroadcastLikeParam);
 
-inline std::string get_reduce_axes_description(const std::string& op_name, int line) {
-  std::string doc = R"code(Computes the __op__ of array elements over given axes.
-
-Defined in )code";
-  doc += std::string(__FILE__) + std::string(":L") + std::to_string(line);
-  size_t pos = 0;
-  std::string holder("__op__");
-  while ((pos = doc.find(holder, pos)) != std::string::npos) {
-    doc.replace(pos, holder.length(), op_name);
-    pos += op_name.length();
-  }
-  return doc;
-}
-
-template<>
-void L2NormComputeEx<cpu>(const nnvm::NodeAttrs& attrs,
-                          const OpContext& ctx,
-                          const std::vector<NDArray>& inputs,
-                          const std::vector<OpReqType>& req,
-                          const std::vector<NDArray>& outputs) {
-  CHECK_EQ(inputs.size(), 1U);
-  CHECK_EQ(outputs.size(), 1U);
-  CHECK_EQ(req.size(), 1U);
-  const NormParam& param = nnvm::get<NormParam>(attrs.parsed);
-  mshadow::Stream<cpu>* s = ctx.get_stream<cpu>();
-  const NDArrayStorageType istype = inputs[0].storage_type();
-  const mxnet::TShape axis = param.axis.has_value() ? param.axis.value() : mxnet::TShape();
-  if ((istype == kRowSparseStorage || istype == kCSRStorage) && axis.ndim() == 0 &&
-       param.ord == 2) {
-    // l2 norm on the entire array
-    L2NormComputeSparseImpl<cpu>(s, inputs[0], req[0], outputs[0].data());
-  } else if (istype == kCSRStorage && axis.ndim() == 1 && (axis[0] == 0 || axis[0] == 1) &&
-             !param.keepdims && param.ord == 2) {
-    // l2 norm on a particular axis
-    NDArray output = outputs[0];
-    ReduceCsrImpl<cpu, sq_sum, false>(s, ctx, inputs[0], req[0], &output, axis);
-    CHECK_EQ(outputs[0].storage_type(), kDefaultStorage);
-    SqRootForL2<cpu>(ctx, req[0], outputs[0].data());
-  } else {
-    LogUnimplementedOp(attrs, ctx, inputs, req, outputs);
-  }
-}
-
-MXNET_OPERATOR_REGISTER_REDUCE(sum)
-MXNET_ADD_SPARSE_OP_ALIAS(sum)
-.add_alias("sum_axis")
-.describe(R"code(Computes the sum of array elements over given axes.
-
-.. Note::
-
-  `sum` and `sum_axis` are equivalent.
-  For ndarray of csr storage type summation along axis 0 and axis 1 is supported.
-  Setting keepdims or exclude to True will cause a fallback to dense operator.
-
-Example::
-
-  data = [[[1, 2], [2, 3], [1, 3]],
-          [[1, 4], [4, 3], [5, 2]],
-          [[7, 1], [7, 2], [7, 3]]]
-
-  sum(data, axis=1)
-  [[  4.   8.]
-   [ 10.   9.]
-   [ 21.   6.]]
-
-  sum(data, axis=[1,2])
-  [ 12.  19.  27.]
-
-  data = [[1, 2, 0],
-          [3, 0, 1],
-          [4, 1, 0]]
-
-  csr = cast_storage(data, 'csr')
-
-  sum(csr, axis=0)
-  [ 8.  3.  1.]
-
-  sum(csr, axis=1)
-  [ 3.  4.  5.]
-
-)code" ADD_FILELINE)
-.set_attr<FCompute>("FCompute<cpu>", ReduceAxesCompute<cpu, mshadow::red::sum>)
-.set_attr<FComputeEx>("FComputeEx<cpu>", ReduceAxesOpForwardEx<cpu, mshadow::red::sum>)
-.set_attr<FInferStorageType>("FInferStorageType", ReduceAxesOpForwardStorage)
-.set_attr<FResourceRequest>("FResourceRequest",
-  [](const NodeAttrs& attrs) {
-    return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
-  })
-.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseNone{"_backward_sum"});
-
-MXNET_OPERATOR_REGISTER_REDUCE_BACKWARD(_backward_sum)
-.set_num_inputs(1)
-.set_attr<FCompute>("FCompute<cpu>", ReduceAxesBackwardUseNone<cpu>);
-
-MXNET_OPERATOR_REGISTER_REDUCE(mean)
-MXNET_ADD_SPARSE_OP_ALIAS(mean)
-.describe(get_reduce_axes_description("mean", __LINE__))
-.set_attr<FCompute>("FCompute<cpu>", ReduceAxesCompute<cpu, mshadow::red::sum, true>)
-.set_attr<FComputeEx>("FComputeEx<cpu>", ReduceAxesOpForwardEx<cpu, mshadow::red::sum, true>)
-.set_attr<FInferStorageType>("FInferStorageType", ReduceAxesOpForwardStorage)
-.set_attr<FResourceRequest>("FResourceRequest",
-  [](const NodeAttrs& attrs) {
-    return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
-  })
-.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseNone{"_backward_mean"});
-
-MXNET_OPERATOR_REGISTER_REDUCE_BACKWARD(_backward_mean)
-.set_num_inputs(1)
-.set_attr<FCompute>("FCompute<cpu>", ReduceAxesBackwardUseNone<cpu, true>);
-
-MXNET_OPERATOR_REGISTER_REDUCE(prod)
-.describe(get_reduce_axes_description("product", __LINE__))
-.set_attr<FCompute>("FCompute<cpu>", ReduceAxesCompute<cpu, mshadow_op::product>)
-.set_attr<FResourceRequest>("FResourceRequest",
-  [](const NodeAttrs& attrs) {
-    return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
-  })
-.set_attr<nnvm::FGradient>("FGradient", ReduceGrad{ "_backward_prod" });
-
-MXNET_OPERATOR_REGISTER_REDUCE_BACKWARD(_backward_prod)
-.set_num_inputs(3)
-.set_attr<FCompute>("FCompute<cpu>", ReduceAxesBackwardUseInOut< cpu, mshadow_op::rdiv>);
-
-MXNET_OPERATOR_REGISTER_REDUCE(nansum)
-.describe(R"code(Computes the sum of array elements over given axes treating Not a Numbers (``NaN``) as zero.
-
-)code" ADD_FILELINE)
-.set_attr<FCompute>("FCompute<cpu>", ReduceAxesCompute<cpu, mshadow_op::nansum>)
-.set_attr<FResourceRequest>("FResourceRequest",
-  [](const NodeAttrs& attrs) {
-    return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
-  })
-.set_attr<nnvm::FGradient>("FGradient", ReduceGrad{ "_backward_nansum" });
-
-MXNET_OPERATOR_REGISTER_REDUCE_BACKWARD(_backward_nansum)
-.set_num_inputs(3)
-.set_attr<FCompute>("FCompute<cpu>", ReduceAxesBackwardUseInOut<cpu, mshadow_op::nansum_grad>);
-
-MXNET_OPERATOR_REGISTER_REDUCE(nanprod)
-.describe(R"code(Computes the product of array elements over given axes treating Not a Numbers (``NaN``) as one.
-
-)code" ADD_FILELINE)
-.set_attr<FCompute>("FCompute<cpu>", ReduceAxesCompute<cpu, mshadow_op::nanprod>)
-.set_attr<FResourceRequest>("FResourceRequest",
-  [](const NodeAttrs& attrs) {
-    return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
-  })
-.set_attr<nnvm::FGradient>("FGradient", ReduceGrad{ "_backward_nanprod" });
-
-MXNET_OPERATOR_REGISTER_REDUCE_BACKWARD(_backward_nanprod)
-.set_num_inputs(3)
-.set_attr<FCompute>("FCompute<cpu>", ReduceAxesBackwardUseInOut<cpu, mshadow_op::nanprod_grad>);
-
-MXNET_OPERATOR_REGISTER_MINMAX_REDUCE(max)
-.add_alias("max_axis")
-.describe(get_reduce_axes_description("max", __LINE__))
-.set_attr<FCompute>("FCompute<cpu>", ReduceAxesCompute<cpu, mshadow::red::maximum>)
-.set_attr<FResourceRequest>("FResourceRequest",
-  [](const NodeAttrs& attrs) {
-    return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
-  })
-.set_attr<nnvm::FGradient>("FGradient", ReduceGrad{"_backward_max"});
-
-MXNET_OPERATOR_REGISTER_REDUCE_BACKWARD(_backward_max)
-.set_num_inputs(3)
-.set_attr<FCompute>("FCompute<cpu>", ReduceAxesBackwardUseInOut<cpu, mshadow_op::eq>);
-
-MXNET_OPERATOR_REGISTER_MINMAX_REDUCE(min)
-.add_alias("min_axis")
-.describe(get_reduce_axes_description("min", __LINE__))
-.set_attr<FCompute>("FCompute<cpu>", ReduceAxesCompute<cpu, mshadow::red::minimum>)
-.set_attr<FResourceRequest>("FResourceRequest",
-  [](const NodeAttrs& attrs) {
-    return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
-  })
-.set_attr<nnvm::FGradient>("FGradient", ReduceGrad{"_backward_min"});
-
-MXNET_OPERATOR_REGISTER_REDUCE_BACKWARD(_backward_min)
-.set_num_inputs(3)
-.set_attr<FCompute>("FCompute<cpu>", ReduceAxesBackwardUseInOut<cpu, mshadow_op::eq>);
-
 MXNET_OPERATOR_REGISTER_BROADCAST(broadcast_axis)
 .add_alias("broadcast_axes")
 .describe(R"code(Broadcasts the input array over particular axes.
@@ -320,62 +138,5 @@ For example::
 .set_attr<mxnet::FInferShape>("FInferShape", BroadcastLikeShape)
 .set_attr<FCompute>("FCompute<cpu>", BroadcastCompute<cpu>);
 
-NNVM_REGISTER_OP(norm)
-MXNET_ADD_SPARSE_OP_ALIAS(norm)
-.describe(R"code(Computes the norm on an NDArray.
-
-This operator computes the norm on an NDArray with the specified axis, depending
-on the value of the ord parameter. By default, it computes the L2 norm on the entire
-array. Currently only ord=2 supports sparse ndarrays.
-
-Examples::
-
-  x = [[[1, 2],
-        [3, 4]],
-       [[2, 2],
-        [5, 6]]]
-
-  norm(x, ord=2, axis=1) = [[3.1622777 4.472136 ]
-                            [5.3851647 6.3245554]]
-
-  norm(x, ord=1, axis=1) = [[4., 6.],
-                            [7., 8.]]
-
-  rsp = x.cast_storage('row_sparse')
-
-  norm(rsp) = [5.47722578]
-
-  csr = x.cast_storage('csr')
-
-  norm(csr) = [5.47722578]
-
-)code" ADD_FILELINE)
-.set_num_inputs(1)
-.set_num_outputs(1)
-.set_attr_parser(ParamParser<NormParam>)
-.set_attr<mxnet::FInferShape>("FInferShape", NormShape)
-.set_attr<nnvm::FInferType>("FInferType", NormType)
-.set_attr<FInferStorageType>("FInferStorageType", LpNormStorageType)
-.set_attr<nnvm::FGradient>("FGradient", ReduceGrad{ "_backward_norm" })
-.set_attr<FResourceRequest>("FResourceRequest",
-  [](const NodeAttrs& attrs) {
-    return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
-  })
-.set_attr<FCompute>("FCompute<cpu>", LpNormCompute<cpu>)
-.set_attr<FComputeEx>("FComputeEx<cpu>", L2NormComputeEx<cpu>)
-.add_argument("data", "NDArray-or-Symbol", "The input")
-.add_arguments(NormParam::__FIELDS__());
-
-NNVM_REGISTER_OP(_backward_norm)
-.set_num_outputs(1)
-.set_attr_parser(ParamParser<NormParam>)
-.set_attr<nnvm::TIsBackward>("TIsBackward", true)
-.set_attr<FResourceRequest>("FResourceRequest",
-  [](const NodeAttrs& attrs) {
-    return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
-  })
-.set_attr<FCompute>("FCompute<cpu>", LpNormGradCompute<cpu>);
-
-
 }  // namespace op
 }  // namespace mxnet
diff --git a/src/operator/tensor/broadcast_reduce_op_value.cu b/src/operator/tensor/broadcast_reduce_op_value.cu
index 2d91c5074496..35b3c0272db8 100644
--- a/src/operator/tensor/broadcast_reduce_op_value.cu
+++ b/src/operator/tensor/broadcast_reduce_op_value.cu
@@ -27,70 +27,6 @@
 namespace mxnet {
 namespace op {
 
-template<>
-void L2NormComputeEx<gpu>(const nnvm::NodeAttrs& attrs,
-                          const OpContext& ctx,
-                          const std::vector<NDArray>& inputs,
-                          const std::vector<OpReqType>& req,
-                          const std::vector<NDArray>& outputs) {
-  CHECK_EQ(inputs.size(), 1U);
-  CHECK_EQ(outputs.size(), 1U);
-  CHECK_EQ(req.size(), 1U);
-  const NormParam& param = nnvm::get<NormParam>(attrs.parsed);
-  mshadow::Stream<gpu>* s = ctx.get_stream<gpu>();
-  const NDArrayStorageType istype = inputs[0].storage_type();
-  const mxnet::TShape axis = param.axis.has_value() ? param.axis.value() : mxnet::TShape();
-  if ((istype == kRowSparseStorage || istype == kCSRStorage) && axis.ndim() == 0 &&
-       param.ord == 2) {
-    // l2 norm on the entire array
-    L2NormComputeSparseImpl<gpu>(s, inputs[0], req[0], outputs[0].data());
-  } else {
-    LogUnimplementedOp(attrs, ctx, inputs, req, outputs);
-  }
-}
-
-NNVM_REGISTER_OP(sum)
-.set_attr<FCompute>("FCompute<gpu>", ReduceAxesCompute<gpu, mshadow::red::sum>);
-
-NNVM_REGISTER_OP(_backward_sum)
-.set_attr<FCompute>("FCompute<gpu>", ReduceAxesBackwardUseNone<gpu>);
-
-NNVM_REGISTER_OP(mean)
-.set_attr<FCompute>("FCompute<gpu>", ReduceAxesCompute<gpu, mshadow::red::sum, true>);
-
-NNVM_REGISTER_OP(_backward_mean)
-.set_attr<FCompute>("FCompute<gpu>", ReduceAxesBackwardUseNone<gpu, true>);
-
-NNVM_REGISTER_OP(prod)
-.set_attr<FCompute>("FCompute<gpu>", ReduceAxesCompute<gpu, mshadow_op::product>);
-
-NNVM_REGISTER_OP(_backward_prod)
-.set_attr<FCompute>("FCompute<gpu>", ReduceAxesBackwardUseInOut<gpu, mshadow_op::rdiv>);
-
-NNVM_REGISTER_OP(nansum)
-.set_attr<FCompute>("FCompute<gpu>", ReduceAxesCompute<gpu, mshadow_op::nansum>);
-
-NNVM_REGISTER_OP(_backward_nansum)
-.set_attr<FCompute>("FCompute<gpu>", ReduceAxesBackwardUseInOut<gpu, mshadow_op::nansum_grad>);
-
-NNVM_REGISTER_OP(nanprod)
-.set_attr<FCompute>("FCompute<gpu>", ReduceAxesCompute<gpu, mshadow_op::nanprod>);
-
-NNVM_REGISTER_OP(_backward_nanprod)
-.set_attr<FCompute>("FCompute<gpu>", ReduceAxesBackwardUseInOut<gpu, mshadow_op::nanprod_grad>);
-
-NNVM_REGISTER_OP(max)
-.set_attr<FCompute>("FCompute<gpu>", ReduceAxesCompute<gpu, mshadow::red::maximum>);
-
-NNVM_REGISTER_OP(_backward_max)
-.set_attr<FCompute>("FCompute<gpu>", ReduceAxesBackwardUseInOut<gpu, mshadow_op::eq>);
-
-NNVM_REGISTER_OP(min)
-.set_attr<FCompute>("FCompute<gpu>", ReduceAxesCompute<gpu, mshadow::red::minimum>);
-
-NNVM_REGISTER_OP(_backward_min)
-.set_attr<FCompute>("FCompute<gpu>", ReduceAxesBackwardUseInOut<gpu, mshadow_op::eq>);
-
 NNVM_REGISTER_OP(broadcast_axis)
 .set_attr<FCompute>("FCompute<gpu>", BroadcastCompute<gpu>);
 
@@ -103,12 +39,5 @@ NNVM_REGISTER_OP(broadcast_like)
 NNVM_REGISTER_OP(_broadcast_backward)
 .set_attr<FCompute>("FCompute<gpu>", ReduceAxesCompute<gpu, mshadow::red::sum>);
 
-NNVM_REGISTER_OP(norm)
-.set_attr<FCompute>("FCompute<gpu>", LpNormCompute<gpu>)
-.set_attr<FComputeEx>("FComputeEx<gpu>", L2NormComputeEx<gpu>);
-
-NNVM_REGISTER_OP(_backward_norm)
-.set_attr<FCompute>("FCompute<gpu>", LpNormGradCompute<gpu>);
-
 }  // namespace op
 }  // namespace mxnet
diff --git a/src/operator/tensor/broadcast_reduce_prod_value.cc b/src/operator/tensor/broadcast_reduce_prod_value.cc
new file mode 100644
index 000000000000..4778865bf11d
--- /dev/null
+++ b/src/operator/tensor/broadcast_reduce_prod_value.cc
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ *  Copyright (c) 2016 by Contributors
+ * \file broadcast_reduce_prod_value.cc
+ * \brief CPU Implementation of broadcast and reduce prod functions based on value.
+ */
+#include "./broadcast_reduce_op.h"
+
+namespace mxnet {
+namespace op {
+
+MXNET_OPERATOR_REGISTER_REDUCE(prod)  // CPU-side registration of the `prod` reduce operator
+.describe(get_reduce_axes_description("product", __LINE__))  // docstring wording: "product"
+.set_attr<FCompute>("FCompute<cpu>", ReduceAxesCompute<cpu, mshadow_op::product>)
+.set_attr<FResourceRequest>("FResourceRequest",  // requests a single temp-space resource
+  [](const NodeAttrs& attrs) {
+    return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+  })
+.set_attr<nnvm::FGradient>("FGradient", ReduceGrad{ "_backward_prod" });
+
+MXNET_OPERATOR_REGISTER_REDUCE_BACKWARD(_backward_prod)  // op named by prod's FGradient above
+.set_num_inputs(3)  // NOTE(review): presumably out-grad, forward input, forward output - confirm
+.set_attr<FCompute>("FCompute<cpu>", ReduceAxesBackwardUseInOut< cpu, mshadow_op::rdiv>);
+
+MXNET_OPERATOR_REGISTER_REDUCE(nanprod)  // product reduce that treats NaN as one (per docstring)
+.describe(R"code(Computes the product of array elements over given axes treating Not a Numbers (``NaN``) as one.
+
+)code" ADD_FILELINE)
+.set_attr<FCompute>("FCompute<cpu>", ReduceAxesCompute<cpu, mshadow_op::nanprod>)
+.set_attr<FResourceRequest>("FResourceRequest",  // requests a single temp-space resource
+  [](const NodeAttrs& attrs) {
+    return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+  })
+.set_attr<nnvm::FGradient>("FGradient", ReduceGrad{ "_backward_nanprod" });
+
+MXNET_OPERATOR_REGISTER_REDUCE_BACKWARD(_backward_nanprod)  // named by nanprod's FGradient
+.set_num_inputs(3)  // NOTE(review): presumably out-grad, forward input, forward output - confirm
+.set_attr<FCompute>("FCompute<cpu>", ReduceAxesBackwardUseInOut<cpu, mshadow_op::nanprod_grad>);
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/tensor/broadcast_reduce_prod_value.cu b/src/operator/tensor/broadcast_reduce_prod_value.cu
new file mode 100644
index 000000000000..5731de308064
--- /dev/null
+++ b/src/operator/tensor/broadcast_reduce_prod_value.cu
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ *  Copyright (c) 2016 by Contributors
+ * \file broadcast_reduce_prod_value.cu
+ * \brief GPU Implementation of broadcast and reduce prod functions based on value.
+ */
+#include "./broadcast_reduce_op.h"
+
+namespace mxnet {
+namespace op {
+
+NNVM_REGISTER_OP(prod)  // adds the GPU FCompute to `prod`; other attributes are set in the .cc
+.set_attr<FCompute>("FCompute<gpu>", ReduceAxesCompute<gpu, mshadow_op::product>);
+
+NNVM_REGISTER_OP(_backward_prod)  // GPU FCompute for the backward op of `prod`
+.set_attr<FCompute>("FCompute<gpu>", ReduceAxesBackwardUseInOut<gpu, mshadow_op::rdiv>);
+
+NNVM_REGISTER_OP(nanprod)  // adds the GPU FCompute to `nanprod`; other attributes are in the .cc
+.set_attr<FCompute>("FCompute<gpu>", ReduceAxesCompute<gpu, mshadow_op::nanprod>);
+
+NNVM_REGISTER_OP(_backward_nanprod)  // GPU FCompute for the backward op of `nanprod`
+.set_attr<FCompute>("FCompute<gpu>", ReduceAxesBackwardUseInOut<gpu, mshadow_op::nanprod_grad>);
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/tensor/broadcast_reduce_sum_value.cc b/src/operator/tensor/broadcast_reduce_sum_value.cc
new file mode 100644
index 000000000000..c5c9f5cb48e4
--- /dev/null
+++ b/src/operator/tensor/broadcast_reduce_sum_value.cc
@@ -0,0 +1,113 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ *  Copyright (c) 2016 by Contributors
+ * \file broadcast_reduce_sum_value.cc
+ * \brief CPU Implementation of broadcast and reduce sum (and related) functions based on value.
+ */
+#include "./broadcast_reduce_op.h"
+
+namespace mxnet {
+namespace op {
+
+MXNET_OPERATOR_REGISTER_REDUCE(sum)  // CPU-side registration of the `sum` reduce operator.
+MXNET_ADD_SPARSE_OP_ALIAS(sum)
+.add_alias("sum_axis")  // second name for the same operator (see the Note in the doc text)
+.describe(R"code(Computes the sum of array elements over given axes.
+
+.. Note::
+
+  `sum` and `sum_axis` are equivalent.
+  For ndarray of csr storage type summation along axis 0 and axis 1 is supported.
+  Setting keepdims or exclude to True will cause a fallback to dense operator.
+
+Example::
+
+  data = [[[1, 2], [2, 3], [1, 3]],
+          [[1, 4], [4, 3], [5, 2]],
+          [[7, 1], [7, 2], [7, 3]]]
+
+  sum(data, axis=1)
+  [[  4.   8.]
+   [ 10.   9.]
+   [ 21.   6.]]
+
+  sum(data, axis=[1,2])
+  [ 12.  19.  27.]
+
+  data = [[1, 2, 0],
+          [3, 0, 1],
+          [4, 1, 0]]
+
+  csr = cast_storage(data, 'csr')
+
+  sum(csr, axis=0)
+  [ 8.  3.  1.]
+
+  sum(csr, axis=1)
+  [ 3.  4.  5.]
+
+)code" ADD_FILELINE)
+.set_attr<FCompute>("FCompute<cpu>", ReduceAxesCompute<cpu, mshadow::red::sum>)
+.set_attr<FComputeEx>("FComputeEx<cpu>", ReduceAxesOpForwardEx<cpu, mshadow::red::sum>)
+.set_attr<FInferStorageType>("FInferStorageType", ReduceAxesOpForwardStorage)
+.set_attr<FResourceRequest>("FResourceRequest",  // the op asks for one kTempSpace resource
+  [](const NodeAttrs& attrs) {
+    return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+  })
+.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseNone{"_backward_sum"});
+// Gradient operator named by the FGradient entry above; registered with a single input.
+MXNET_OPERATOR_REGISTER_REDUCE_BACKWARD(_backward_sum)
+.set_num_inputs(1)
+.set_attr<FCompute>("FCompute<cpu>", ReduceAxesBackwardUseNone<cpu>);
+
+MXNET_OPERATOR_REGISTER_REDUCE(mean)  // CPU-side registration of the `mean` reduce operator.
+MXNET_ADD_SPARSE_OP_ALIAS(mean)  // `true` below presumably enables mean scaling -- TODO confirm
+.describe(get_reduce_axes_description("mean", __LINE__))
+.set_attr<FCompute>("FCompute<cpu>", ReduceAxesCompute<cpu, mshadow::red::sum, true>)
+.set_attr<FComputeEx>("FComputeEx<cpu>", ReduceAxesOpForwardEx<cpu, mshadow::red::sum, true>)
+.set_attr<FInferStorageType>("FInferStorageType", ReduceAxesOpForwardStorage)
+.set_attr<FResourceRequest>("FResourceRequest",  // the op asks for one kTempSpace resource
+  [](const NodeAttrs& attrs) {
+    return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+  })
+.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseNone{"_backward_mean"});
+// Gradient operator named by the FGradient entry above; same driver as sum plus `true`.
+MXNET_OPERATOR_REGISTER_REDUCE_BACKWARD(_backward_mean)
+.set_num_inputs(1)
+.set_attr<FCompute>("FCompute<cpu>", ReduceAxesBackwardUseNone<cpu, true>);
+
+MXNET_OPERATOR_REGISTER_REDUCE(nansum)  // CPU registration; no FComputeEx or sparse alias here.
+.describe(R"code(Computes the sum of array elements over given axes treating Not a Numbers (``NaN``) as zero.
+
+)code" ADD_FILELINE)
+.set_attr<FCompute>("FCompute<cpu>", ReduceAxesCompute<cpu, mshadow_op::nansum>)
+.set_attr<FResourceRequest>("FResourceRequest",  // the op asks for one kTempSpace resource
+  [](const NodeAttrs& attrs) {
+    return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+  })
+.set_attr<nnvm::FGradient>("FGradient", ReduceGrad{ "_backward_nansum" });
+// Gradient operator named by the FGradient entry above; registered with three inputs.
+MXNET_OPERATOR_REGISTER_REDUCE_BACKWARD(_backward_nansum)
+.set_num_inputs(3)  // NOTE(review): presumably output grad, forward input and output -- confirm
+.set_attr<FCompute>("FCompute<cpu>", ReduceAxesBackwardUseInOut<cpu, mshadow_op::nansum_grad>);
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/tensor/broadcast_reduce_sum_value.cu b/src/operator/tensor/broadcast_reduce_sum_value.cu
new file mode 100644
index 000000000000..2385d36f35b0
--- /dev/null
+++ b/src/operator/tensor/broadcast_reduce_sum_value.cu
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ *  Copyright (c) 2016 by Contributors
+ * \file broadcast_reduce_sum_value.cu
+ * \brief GPU Implementation of broadcast and reduce sum (and related) functions based on value.
+ */
+#include "./broadcast_reduce_op.h"
+
+namespace mxnet {
+namespace op {
+
+NNVM_REGISTER_OP(sum)  // Only FCompute<gpu> is set in this file; args mirror the cpu kernels.
+.set_attr<FCompute>("FCompute<gpu>", ReduceAxesCompute<gpu, mshadow::red::sum>);
+// Backward of sum on GPU (ReduceAxesBackwardUseNone).
+NNVM_REGISTER_OP(_backward_sum)
+.set_attr<FCompute>("FCompute<gpu>", ReduceAxesBackwardUseNone<gpu>);
+// mean reuses the sum reducer; the extra `true` presumably selects mean scaling (TODO confirm).
+NNVM_REGISTER_OP(mean)
+.set_attr<FCompute>("FCompute<gpu>", ReduceAxesCompute<gpu, mshadow::red::sum, true>);
+// Backward of mean on GPU: same driver as _backward_sum with the extra `true` argument.
+NNVM_REGISTER_OP(_backward_mean)
+.set_attr<FCompute>("FCompute<gpu>", ReduceAxesBackwardUseNone<gpu, true>);
+// nansum on GPU, using the mshadow_op::nansum reducer.
+NNVM_REGISTER_OP(nansum)
+.set_attr<FCompute>("FCompute<gpu>", ReduceAxesCompute<gpu, mshadow_op::nansum>);
+// Backward of nansum on GPU (ReduceAxesBackwardUseInOut with mshadow_op::nansum_grad).
+NNVM_REGISTER_OP(_backward_nansum)
+.set_attr<FCompute>("FCompute<gpu>", ReduceAxesBackwardUseInOut<gpu, mshadow_op::nansum_grad>);
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/tensor/elemwise_unary_op_basic.cc b/src/operator/tensor/elemwise_unary_op_basic.cc
index c293ee6e22bd..117cfa96518a 100644
--- a/src/operator/tensor/elemwise_unary_op_basic.cc
+++ b/src/operator/tensor/elemwise_unary_op_basic.cc
@@ -703,55 +703,6 @@ The storage type of ``negative`` output depends upon the input storage type:
 )code")
 .set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseNone{"negative"});
 
-// reciprocal
-MXNET_OPERATOR_REGISTER_UNARY(reciprocal)
-.describe(R"code(Returns the reciprocal of the argument, element-wise.
-
-Calculates 1/x.
-
-Example::
-
-    reciprocal([-2, 1, 3, 1.6, 0.2]) = [-0.5, 1.0, 0.33333334, 0.625, 5.0]
-
-)code" ADD_FILELINE)
-.set_attr<FCompute>("FCompute<cpu>", UnaryOp::Compute<cpu, mshadow_op::reciprocal>)
-.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseIn{"_backward_reciprocal"});
-
-MXNET_OPERATOR_REGISTER_BINARY(_backward_reciprocal)
-.set_attr<FCompute>("FCompute<cpu>",
-  ElemwiseBinaryOp::Compute<cpu, unary_bwd<mshadow_op::reciprocal_grad> >)
-.set_attr<nnvm::FGradient>("FGradient",
-  [](const nnvm::NodePtr& n, const std::vector<nnvm::NodeEntry>& ograds) {
-    // ograds[0]: dL/dxgrad
-    // inputs[0]: dL/dy
-    // inputs[1]: x
-    // f(x) = y = 1/x
-    // f'(x) = -1/x^2
-    // f''(x) = 2/x^3 = -2 * (f'(x) * f(x))
-
-    const std::unordered_map<std::string, std::string> args = {{"scalar", "-2.0"}};
-
-    auto dydx_mul_dldy = nnvm::NodeEntry{n};  // f'(x) * head_grads
-    auto dydx = MakeNode("elemwise_div", n->attrs.name + "_dydx",
-                         {dydx_mul_dldy, n->inputs[0]}, nullptr, &n);
-    auto fx = MakeNode("reciprocal", n->attrs.name + "_fx",
-                       {n->inputs[1]}, nullptr, &n);
-
-    auto d2ydx2_mid = MakeNode("elemwise_mul", n->attrs.name + "_d2ydx2_mid",
-                               {dydx_mul_dldy, nnvm::NodeEntry{fx}}, nullptr, &n);
-
-    auto d2ydx2 = MakeNode("_mul_scalar", n->attrs.name + "_d2ydx2",
-                           {nnvm::NodeEntry{d2ydx2_mid}}, &args, &n);
-
-    std::vector<nnvm::NodeEntry> ret;
-
-    ret.emplace_back(MakeNode("elemwise_mul", n->attrs.name + "_backward_grad_grad",
-                             {ograds[0], nnvm::NodeEntry{dydx}}, nullptr, &n));
-    ret.emplace_back(MakeNode("elemwise_mul", n->attrs.name + "_backward_grad_grad_inp",
-                             {ograds[0], nnvm::NodeEntry{d2ydx2}}, nullptr, &n));
-    return ret;
-});
-
 // abs
 MXNET_OPERATOR_REGISTER_UNARY_WITH_RSP_CSR(abs, cpu, mshadow_op::abs)
 .describe(R"code(Returns element-wise absolute value of the input.
@@ -923,116 +874,6 @@ The storage type of ``fix`` output depends upon the input storage type:
 )code" ADD_FILELINE)
 .set_attr<nnvm::FGradient>("FGradient", MakeZeroGradNodes);
 
-// square
-#if MSHADOW_USE_MKL == 1
-MXNET_MKL_OPERATOR_REGISTER_UNARY_WITH_RSP_CSR(square, cpu, mshadow_op::square, mkl_func::square)
-.describe(R"code(Returns element-wise squared value of the input.
-
-.. math::
-   square(x) = x^2
-
-Example::
-
-   square([2, 3, 4]) = [4, 9, 16]
-
-The storage type of ``square`` output depends upon the input storage type:
-
-   - square(default) = default
-   - square(row_sparse) = row_sparse
-   - square(csr) = csr
-
-)code" ADD_FILELINE)
-.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseIn{"_backward_square"});
-#else
-MXNET_OPERATOR_REGISTER_UNARY_WITH_RSP_CSR(square, cpu, mshadow_op::square)
-.describe(R"code(Returns element-wise squared value of the input.
-
-.. math::
-   square(x) = x^2
-
-Example::
-
-   square([2, 3, 4]) = [4, 9, 16]
-
-The storage type of ``square`` output depends upon the input storage type:
-
-   - square(default) = default
-   - square(row_sparse) = row_sparse
-   - square(csr) = csr
-
-)code" ADD_FILELINE)
-.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseIn{"_backward_square"});
-#endif
-
-MXNET_OPERATOR_REGISTER_BINARY_WITH_SPARSE_CPU(_backward_square,
-                                               unary_bwd<mshadow_op::square_grad>);
-
-// sqrt
-MXNET_OPERATOR_REGISTER_UNARY_WITH_RSP_CSR(sqrt, cpu, mshadow_op::square_root)
-.describe(R"code(Returns element-wise square-root value of the input.
-
-.. math::
-   \textrm{sqrt}(x) = \sqrt{x}
-
-Example::
-
-   sqrt([4, 9, 16]) = [2, 3, 4]
-
-The storage type of ``sqrt`` output depends upon the input storage type:
-
-   - sqrt(default) = default
-   - sqrt(row_sparse) = row_sparse
-   - sqrt(csr) = csr
-
-)code" ADD_FILELINE)
-.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseOut{"_backward_sqrt"});
-
-MXNET_OPERATOR_REGISTER_BINARY_WITH_SPARSE_CPU_DR(_backward_sqrt,
-                                                  unary_bwd<mshadow_op::square_root_grad>);
-
-// rsqrt
-MXNET_OPERATOR_REGISTER_UNARY_WITH_SPARSE_DR(rsqrt, cpu, mshadow_op::reciprocal_square_root)
-MXNET_ADD_SPARSE_OP_ALIAS(rsqrt)
-.describe(R"code(Returns element-wise inverse square-root value of the input.
-
-.. math::
-   rsqrt(x) = 1/\sqrt{x}
-
-Example::
-
-   rsqrt([4,9,16]) = [0.5, 0.33333334, 0.25]
-
-The storage type of ``rsqrt`` output is always dense
-
-)code" ADD_FILELINE)
-.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseIn{"_backward_rsqrt"});
-
-MXNET_OPERATOR_REGISTER_BINARY_WITH_SPARSE_CPU_DR(
-  _backward_rsqrt, unary_bwd<mshadow_op::reciprocal_square_root_grad>);
-
-// cbrt
-MXNET_OPERATOR_REGISTER_UNARY_WITH_RSP_CSR(cbrt, cpu, mshadow_op::cube_root)
-.describe(R"code(Returns element-wise cube-root value of the input.
-
-.. math::
-   cbrt(x) = \sqrt[3]{x}
-
-Example::
-
-   cbrt([1, 8, -125]) = [1, 2, -5]
-
-The storage type of ``cbrt`` output depends upon the input storage type:
-
-   - cbrt(default) = default
-   - cbrt(row_sparse) = row_sparse
-   - cbrt(csr) = csr
-
-)code" ADD_FILELINE)
-.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseOut{"_backward_cbrt"});
-
-MXNET_OPERATOR_REGISTER_BINARY_WITH_SPARSE_CPU_DR(_backward_cbrt,
-                                                  unary_bwd<mshadow_op::cube_root_grad>);
-
 // erf
 MXNET_OPERATOR_REGISTER_UNARY(erf)
 .describe(R"code(Returns element-wise gauss error function of the input.
@@ -1070,224 +911,6 @@ MXNET_OPERATOR_REGISTER_BINARY(_backward_erfinv)
 .set_attr<FCompute>("FCompute<cpu>",
                     ElemwiseBinaryOp::Compute<cpu, unary_bwd<mshadow_op::erfinv_grad>>);
 
-// rcbrt
-MXNET_OPERATOR_REGISTER_UNARY(rcbrt)
-.describe(R"code(Returns element-wise inverse cube-root value of the input.
-
-.. math::
-   rcbrt(x) = 1/\sqrt[3]{x}
-
-Example::
-
-   rcbrt([1,8,-125]) = [1.0, 0.5, -0.2]
-
-)code" ADD_FILELINE)
-.set_attr<FCompute>("FCompute<cpu>", UnaryOp::Compute<cpu, mshadow_op::reciprocal_cube_root>)
-.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseIn{"_backward_rcbrt"});
-
-MXNET_OPERATOR_REGISTER_BINARY(_backward_rcbrt)
-.set_attr<FCompute>("FCompute<cpu>",
-                    ElemwiseBinaryOp::Compute<cpu,
-                      unary_bwd<mshadow_op::reciprocal_cube_root_grad>>);
-
-// exp
-#if MSHADOW_USE_MKL == 1
-MXNET_MKL_OPERATOR_REGISTER_UNARY_WITH_SPARSE_DR(exp, cpu, mshadow_op::exp, mkl_func::exp)
-MXNET_ADD_SPARSE_OP_ALIAS(exp)
-.describe(R"code(Returns element-wise exponential value of the input.
-
-.. math::
-   exp(x) = e^x \approx 2.718^x
-
-Example::
-
-   exp([0, 1, 2]) = [1., 2.71828175, 7.38905621]
-
-The storage type of ``exp`` output is always dense
-
-)code" ADD_FILELINE)
-.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseOut{"_mul"});
-#else
-MXNET_OPERATOR_REGISTER_UNARY_WITH_SPARSE_DR(exp, cpu, mshadow_op::exp)
-MXNET_ADD_SPARSE_OP_ALIAS(exp)
-.describe(R"code(Returns element-wise exponential value of the input.
-
-.. math::
-   exp(x) = e^x \approx 2.718^x
-
-Example::
-
-   exp([0, 1, 2]) = [1., 2.71828175, 7.38905621]
-
-The storage type of ``exp`` output is always dense
-
-)code" ADD_FILELINE)
-.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseOut{"_mul"});
-#endif
-
-// log
-MXNET_OPERATOR_REGISTER_UNARY(log)
-MXNET_ADD_SPARSE_OP_ALIAS(log)
-.describe(R"code(Returns element-wise Natural logarithmic value of the input.
-
-The natural logarithm is logarithm in base *e*, so that ``log(exp(x)) = x``
-
-The storage type of ``log`` output is always dense
-
-)code" ADD_FILELINE)
-#if MSHADOW_USE_MKL == 1
-.set_attr<FCompute>("FCompute<cpu>", UnaryOp::MKL_Compute<mshadow_op::log, mkl_func::log>)
-#else
-.set_attr<FCompute>("FCompute<cpu>", UnaryOp::Compute<cpu, mshadow_op::log>)
-#endif    // MSHADOW_USE_MKL == 1
-.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseIn{"_backward_log"});
-
-// log10
-MXNET_OPERATOR_REGISTER_UNARY_WITH_SPARSE_DR(log10, cpu, mshadow_op::log10)
-MXNET_ADD_SPARSE_OP_ALIAS(log10)
-.describe(R"code(Returns element-wise Base-10 logarithmic value of the input.
-
-``10**log10(x) = x``
-
-The storage type of ``log10`` output is always dense
-
-)code" ADD_FILELINE)
-.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseIn{"_backward_log10"});
-
-// log2
-MXNET_OPERATOR_REGISTER_UNARY_WITH_SPARSE_DR(log2, cpu, mshadow_op::log2)
-MXNET_ADD_SPARSE_OP_ALIAS(log2)
-.describe(R"code(Returns element-wise Base-2 logarithmic value of the input.
-
-``2**log2(x) = x``
-
-The storage type of ``log2`` output is always dense
-
-)code" ADD_FILELINE)
-.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseIn{"_backward_log2"});
-
-MXNET_OPERATOR_REGISTER_BINARY_WITH_SPARSE_CPU_DR(_backward_log,
-                                                  unary_bwd<mshadow_op::log_grad>)
-.set_attr<nnvm::FGradient>("FGradient",
-  [](const nnvm::NodePtr& n, const std::vector<nnvm::NodeEntry>& ograds) {
-    // ograds[0]: dL/dxgrad
-    // inputs[0]: dL/dy
-    // inputs[1]: x
-    // f(x) = y = log(x)
-    // f'(x) = 1/x
-    // f''(x) = -1 * (f'(x) * f'(x))
-    auto dydx_mul_dldy = nnvm::NodeEntry{n};  // f'(x) * head_grads
-    auto dlogx = MakeNode("reciprocal", n->attrs.name + "_dlogx",
-                            {n->inputs[1]}, nullptr, &n);
-    auto d2ydx2_mid = MakeNode("elemwise_mul", n->attrs.name + "_d2ydx2_mid",
-                            {dydx_mul_dldy, nnvm::NodeEntry{dlogx}}, nullptr, &n);
-    auto d2ydx2 = MakeNode("negative", n->attrs.name + "_d2ydx2",
-                        {nnvm::NodeEntry{d2ydx2_mid}}, nullptr, &n);
-
-    std::vector<nnvm::NodeEntry> ret;
-
-    ret.emplace_back(MakeNode("elemwise_mul", n->attrs.name + "_backward_grad_grad",
-                             {ograds[0], nnvm::NodeEntry{dlogx}}, nullptr, &n));
-    ret.emplace_back(MakeNode("elemwise_mul", n->attrs.name + "_backward_grad_grad_inp",
-                             {ograds[0], nnvm::NodeEntry{d2ydx2}}, nullptr, &n));
-    return ret;
-  });
-
-MXNET_OPERATOR_REGISTER_BINARY_WITH_SPARSE_CPU_DR(_backward_log10,
-                                                  unary_bwd<mshadow_op::log10_grad>)
-.set_attr<nnvm::FGradient>("FGradient",
-  [](const nnvm::NodePtr& n, const std::vector<nnvm::NodeEntry>& ograds) {
-    // ograds[0]: dL/dxgrad
-    // inputs[0]: dL/dy
-    // inputs[1]: x
-    // f(x) = y = log10(x)
-    // f'(x) = 1 / (log(10) * x)
-    // f''(x) = -1 * (f'(x) * 1/x)
-    auto dydx_mul_dldy = nnvm::NodeEntry{n};  // f'(x) * head_grads
-    auto dydx = MakeNode("elemwise_div", n->attrs.name + "_dydx",
-                            {n->inputs[0]}, nullptr, &n);
-    auto dlogx = MakeNode("reciprocal", n->attrs.name + "_dlogx",
-                            {n->inputs[1]}, nullptr, &n);
-    auto d2ydx2_mid = MakeNode("elemwise_mul", n->attrs.name + "_d2ydx2_mid",
-                            {dydx_mul_dldy, nnvm::NodeEntry{dlogx}}, nullptr, &n);
-    auto d2ydx2 = MakeNode("negative", n->attrs.name + "_d2ydx2",
-                        {nnvm::NodeEntry{d2ydx2_mid}}, nullptr, &n);
-
-    std::vector<nnvm::NodeEntry> ret;
-
-    ret.emplace_back(MakeNode("elemwise_mul", n->attrs.name + "_backward_grad_grad",
-                             {ograds[0], nnvm::NodeEntry{dydx}}, nullptr, &n));
-    ret.emplace_back(MakeNode("elemwise_mul", n->attrs.name + "_backward_grad_grad_inp",
-                             {ograds[0], nnvm::NodeEntry{d2ydx2}}, nullptr, &n));
-    return ret;
-  });
-
-MXNET_OPERATOR_REGISTER_BINARY_WITH_SPARSE_CPU_DR(_backward_log2,
-                                                  unary_bwd<mshadow_op::log2_grad>)
-.set_attr<nnvm::FGradient>("FGradient",
-  [](const nnvm::NodePtr& n, const std::vector<nnvm::NodeEntry>& ograds) {
-    // ograds[0]: dL/dxgrad
-    // inputs[0]: dL/dy
-    // inputs[1]: x
-    // f(x) = y = log2(x)
-    // f'(x) = 1 / (log(2) * x)
-    // f''(x) = -1 * (f'(x) * 1/x)
-    auto dydx_mul_dldy = nnvm::NodeEntry{n};  // f'(x) * head_grads
-    auto dydx = MakeNode("elemwise_div", n->attrs.name + "_dydx",
-                            {n->inputs[0]}, nullptr, &n);
-    auto dlogx = MakeNode("reciprocal", n->attrs.name + "_dlogx",
-                            {n->inputs[1]}, nullptr, &n);
-    auto d2ydx2_mid = MakeNode("elemwise_mul", n->attrs.name + "_d2ydx2_mid",
-                            {dydx_mul_dldy, nnvm::NodeEntry{dlogx}}, nullptr, &n);
-    auto d2ydx2 = MakeNode("negative", n->attrs.name + "_d2ydx2",
-                        {nnvm::NodeEntry{d2ydx2_mid}}, nullptr, &n);
-
-    std::vector<nnvm::NodeEntry> ret;
-
-    ret.emplace_back(MakeNode("elemwise_mul", n->attrs.name + "_backward_grad_grad",
-                             {ograds[0], nnvm::NodeEntry{dydx}}, nullptr, &n));
-    ret.emplace_back(MakeNode("elemwise_mul", n->attrs.name + "_backward_grad_grad_inp",
-                             {ograds[0], nnvm::NodeEntry{d2ydx2}}, nullptr, &n));
-    return ret;
-  });
-
-// log1p
-MXNET_OPERATOR_REGISTER_UNARY_WITH_RSP_CSR(log1p, cpu, mshadow_op::log1p)
-.describe(R"code(Returns element-wise ``log(1 + x)`` value of the input.
-
-This function is more accurate than ``log(1 + x)``  for small ``x`` so that
-:math:`1+x\approx 1`
-
-The storage type of ``log1p`` output depends upon the input storage type:
-
-   - log1p(default) = default
-   - log1p(row_sparse) = row_sparse
-   - log1p(csr) = csr
-
-)code" ADD_FILELINE)
-.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseIn{"_backward_log1p"});
-
-MXNET_OPERATOR_REGISTER_BINARY_WITH_SPARSE_CPU_DR(_backward_log1p,
-                                                  unary_bwd<mshadow_op::log1p_grad>);
-
-// expm1
-MXNET_OPERATOR_REGISTER_UNARY_WITH_RSP_CSR(expm1, cpu, mshadow_op::expm1)
-.describe(R"code(Returns ``exp(x) - 1`` computed element-wise on the input.
-
-This function provides greater precision than ``exp(x) - 1`` for small values of ``x``.
-
-The storage type of ``expm1`` output depends upon the input storage type:
-
-   - expm1(default) = default
-   - expm1(row_sparse) = row_sparse
-   - expm1(csr) = csr
-
-)code" ADD_FILELINE)
-.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseIn{"_backward_expm1"});
-
-MXNET_OPERATOR_REGISTER_BINARY_WITH_SPARSE_CPU_DR(_backward_expm1, unary_bwd<mshadow_op::exp>);
-
-
 // gamma
 MXNET_OPERATOR_REGISTER_UNARY_WITH_SPARSE_DR(gamma, cpu, mshadow_op::gamma)
 MXNET_ADD_SPARSE_OP_ALIAS(gamma)
diff --git a/src/operator/tensor/elemwise_unary_op_basic.cu b/src/operator/tensor/elemwise_unary_op_basic.cu
index 642cb0e6e48b..e5b60b1726e6 100644
--- a/src/operator/tensor/elemwise_unary_op_basic.cu
+++ b/src/operator/tensor/elemwise_unary_op_basic.cu
@@ -154,14 +154,6 @@ NNVM_REGISTER_OP(negative)
 .set_attr<FCompute>("FCompute<gpu>", UnaryOp::Compute<gpu, mshadow_op::negation>)
 .set_attr<FComputeEx>("FComputeEx<gpu>", UnaryOp::ComputeEx<gpu, mshadow_op::negation>);
 
-// reciprocal
-NNVM_REGISTER_OP(reciprocal)
-.set_attr<FCompute>("FCompute<gpu>", UnaryOp::Compute<gpu, mshadow_op::reciprocal>);
-
-NNVM_REGISTER_OP(_backward_reciprocal)
-.set_attr<FCompute>("FCompute<gpu>", ElemwiseBinaryOp::Compute<
-  gpu, unary_bwd<mshadow_op::reciprocal_grad> >);
-
 // abs
 NNVM_REGISTER_OP(abs)
 .set_attr<FCompute>("FCompute<gpu>", UnaryOp::Compute<gpu, mshadow_op::abs>)
@@ -204,104 +196,11 @@ NNVM_REGISTER_OP(rint)
 .set_attr<FCompute>("FCompute<gpu>", UnaryOp::Compute<gpu, mshadow_op::rint>)
 .set_attr<FComputeEx>("FComputeEx<gpu>", UnaryOp::ComputeEx<gpu, mshadow_op::rint>);
 
-
 // fix
 NNVM_REGISTER_OP(fix)
 .set_attr<FCompute>("FCompute<gpu>", UnaryOp::Compute<gpu, mshadow_op::fix>)
 .set_attr<FComputeEx>("FComputeEx<gpu>", UnaryOp::ComputeEx<gpu, mshadow_op::fix>);
 
-
-// square
-NNVM_REGISTER_OP(square)
-.set_attr<FCompute>("FCompute<gpu>", UnaryOp::Compute<gpu, mshadow_op::square>)
-.set_attr<FComputeEx>("FComputeEx<gpu>", UnaryOp::ComputeEx<gpu, mshadow_op::square>);
-
-NNVM_REGISTER_OP(_backward_square)
-.set_attr<FCompute>("FCompute<gpu>", ElemwiseBinaryOp::Compute<
-  gpu, unary_bwd<mshadow_op::square_grad> >);
-
-// sqrt
-NNVM_REGISTER_OP(sqrt)
-.set_attr<FCompute>("FCompute<gpu>", UnaryOp::Compute<gpu, mshadow_op::square_root>)
-.set_attr<FComputeEx>("FComputeEx<gpu>", UnaryOp::ComputeEx<gpu, mshadow_op::square_root>);
-
-
-NNVM_REGISTER_OP(_backward_sqrt)
-.set_attr<FCompute>("FCompute<gpu>", ElemwiseBinaryOp::Compute<
-  gpu, unary_bwd<mshadow_op::square_root_grad> >);
-
-// rsqrt
-NNVM_REGISTER_OP(rsqrt)
-.set_attr<FCompute>("FCompute<gpu>", UnaryOp::Compute<gpu, mshadow_op::reciprocal_square_root>);
-
-NNVM_REGISTER_OP(_backward_rsqrt)
-.set_attr<FCompute>("FCompute<gpu>",
-  ElemwiseBinaryOp::Compute<gpu, unary_bwd<mshadow_op::reciprocal_square_root_grad> >);
-
-// cbrt
-NNVM_REGISTER_OP(cbrt)
-.set_attr<FCompute>("FCompute<gpu>", UnaryOp::Compute<gpu, mshadow_op::cube_root>)
-.set_attr<FComputeEx>("FComputeEx<gpu>", UnaryOp::ComputeEx<gpu, mshadow_op::cube_root>);
-
-
-NNVM_REGISTER_OP(_backward_cbrt)
-.set_attr<FCompute>("FCompute<gpu>", ElemwiseBinaryOp::Compute<
-  gpu, unary_bwd<mshadow_op::cube_root_grad> >);
-
-// rcbrt
-NNVM_REGISTER_OP(rcbrt)
-.set_attr<FCompute>("FCompute<gpu>", UnaryOp::Compute<gpu, mshadow_op::reciprocal_cube_root>);
-
-NNVM_REGISTER_OP(_backward_rcbrt)
-.set_attr<FCompute>("FCompute<gpu>",
-  ElemwiseBinaryOp::Compute<gpu, unary_bwd<mshadow_op::reciprocal_cube_root_grad> >);
-
-// exp
-NNVM_REGISTER_OP(exp)
-.set_attr<FCompute>("FCompute<gpu>", UnaryOp::Compute<gpu, mshadow_op::exp>);
-
-// log
-NNVM_REGISTER_OP(log)
-.set_attr<FCompute>("FCompute<gpu>", UnaryOp::Compute<gpu, mshadow_op::log>);
-
-// log10
-NNVM_REGISTER_OP(log10)
-.set_attr<FCompute>("FCompute<gpu>", UnaryOp::Compute<gpu, mshadow_op::log10>);
-
-// log2
-NNVM_REGISTER_OP(log2)
-.set_attr<FCompute>("FCompute<gpu>", UnaryOp::Compute<gpu, mshadow_op::log2>);
-
-NNVM_REGISTER_OP(_backward_log)
-.set_attr<FCompute>("FCompute<gpu>", ElemwiseBinaryOp::Compute<
-  gpu, unary_bwd<mshadow_op::log_grad> >);
-
-NNVM_REGISTER_OP(_backward_log10)
-.set_attr<FCompute>("FCompute<gpu>", ElemwiseBinaryOp::Compute<
-  gpu, unary_bwd<mshadow_op::log10_grad> >);
-
-NNVM_REGISTER_OP(_backward_log2)
-.set_attr<FCompute>("FCompute<gpu>", ElemwiseBinaryOp::Compute<
-  gpu, unary_bwd<mshadow_op::log2_grad> >);
-
-// log1p
-NNVM_REGISTER_OP(log1p)
-.set_attr<FCompute>("FCompute<gpu>", UnaryOp::Compute<gpu, mshadow_op::log1p>)
-.set_attr<FComputeEx>("FComputeEx<gpu>", UnaryOp::ComputeEx<gpu, mshadow_op::log1p>);
-
-NNVM_REGISTER_OP(_backward_log1p)
-.set_attr<FCompute>("FCompute<gpu>", ElemwiseBinaryOp::Compute<
-  gpu, unary_bwd<mshadow_op::log1p_grad> >);
-
-// expm1
-NNVM_REGISTER_OP(expm1)
-.set_attr<FCompute>("FCompute<gpu>", UnaryOp::Compute<gpu, mshadow_op::expm1>)
-.set_attr<FComputeEx>("FComputeEx<gpu>", UnaryOp::ComputeEx<gpu, mshadow_op::expm1>);
-
-NNVM_REGISTER_OP(_backward_expm1)
-.set_attr<FCompute>("FCompute<gpu>", ElemwiseBinaryOp::Compute<
-  gpu, unary_bwd<mshadow_op::exp> >);
-
 // gamma
 NNVM_REGISTER_OP(gamma)
 .set_attr<FCompute>("FCompute<gpu>", UnaryOp::Compute<gpu, mshadow_op::gamma>);
diff --git a/src/operator/tensor/elemwise_unary_op_logexp.cc b/src/operator/tensor/elemwise_unary_op_logexp.cc
new file mode 100644
index 000000000000..65394826276f
--- /dev/null
+++ b/src/operator/tensor/elemwise_unary_op_logexp.cc
@@ -0,0 +1,230 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file elemwise_unary_op_logexp.cc
+ * \brief CPU Implementation of elementwise log and exp functions.
+ */
+#include <mxnet/base.h>
+#include "elemwise_unary_op.h"
+#include "./elemwise_binary_op-inl.h"
+#include "../nn/mkldnn/mkldnn_ops-inl.h"
+
+namespace mxnet {
+namespace op {
+
+// exp: the operator is registered once; MSHADOW_USE_MKL picks the MKL or the generic macro.
+#if MSHADOW_USE_MKL == 1
+MXNET_MKL_OPERATOR_REGISTER_UNARY_WITH_SPARSE_DR(exp, cpu, mshadow_op::exp, mkl_func::exp)
+MXNET_ADD_SPARSE_OP_ALIAS(exp)
+.describe(R"code(Returns element-wise exponential value of the input.
+
+.. math::
+   exp(x) = e^x \approx 2.718^x
+
+Example::
+
+   exp([0, 1, 2]) = [1., 2.71828175, 7.38905621]
+
+The storage type of ``exp`` output is always dense
+
+)code" ADD_FILELINE)
+.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseOut{"_mul"});
+#else  // generic build: same doc text and gradient, registered without mkl_func::exp
+MXNET_OPERATOR_REGISTER_UNARY_WITH_SPARSE_DR(exp, cpu, mshadow_op::exp)
+MXNET_ADD_SPARSE_OP_ALIAS(exp)
+.describe(R"code(Returns element-wise exponential value of the input.
+
+.. math::
+   exp(x) = e^x \approx 2.718^x
+
+Example::
+
+   exp([0, 1, 2]) = [1., 2.71828175, 7.38905621]
+
+The storage type of ``exp`` output is always dense
+
+)code" ADD_FILELINE)
+.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseOut{"_mul"});
+#endif  // MSHADOW_USE_MKL == 1
+
+// log: natural logarithm; its gradient operator is `_backward_log`.
+MXNET_OPERATOR_REGISTER_UNARY(log)
+MXNET_ADD_SPARSE_OP_ALIAS(log)
+.describe(R"code(Returns element-wise Natural logarithmic value of the input.
+
+The natural logarithm is logarithm in base *e*, so that ``log(exp(x)) = x``
+
+The storage type of ``log`` output is always dense
+
+)code" ADD_FILELINE)
+#if MSHADOW_USE_MKL == 1  // only the CPU kernel differs between the two build flavours
+.set_attr<FCompute>("FCompute<cpu>", UnaryOp::MKL_Compute<mshadow_op::log, mkl_func::log>)
+#else
+.set_attr<FCompute>("FCompute<cpu>", UnaryOp::Compute<cpu, mshadow_op::log>)
+#endif    // MSHADOW_USE_MKL == 1
+.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseIn{"_backward_log"});
+
+// log10: base-10 logarithm on CPU (mshadow_op::log10); gradient operator is `_backward_log10`.
+MXNET_OPERATOR_REGISTER_UNARY_WITH_SPARSE_DR(log10, cpu, mshadow_op::log10)
+MXNET_ADD_SPARSE_OP_ALIAS(log10)
+.describe(R"code(Returns element-wise Base-10 logarithmic value of the input.
+
+``10**log10(x) = x``
+
+The storage type of ``log10`` output is always dense
+
+)code" ADD_FILELINE)
+.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseIn{"_backward_log10"});
+
+// log2: base-2 logarithm on CPU (mshadow_op::log2); gradient operator is `_backward_log2`.
+MXNET_OPERATOR_REGISTER_UNARY_WITH_SPARSE_DR(log2, cpu, mshadow_op::log2)
+MXNET_ADD_SPARSE_OP_ALIAS(log2)
+.describe(R"code(Returns element-wise Base-2 logarithmic value of the input.
+
+``2**log2(x) = x``
+
+The storage type of ``log2`` output is always dense
+
+)code" ADD_FILELINE)
+.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseIn{"_backward_log2"});
+
+MXNET_OPERATOR_REGISTER_BINARY_WITH_SPARSE_CPU_DR(_backward_log,
+                                                  unary_bwd<mshadow_op::log_grad>)
+.set_attr<nnvm::FGradient>("FGradient",
+  [](const nnvm::NodePtr& n, const std::vector<nnvm::NodeEntry>& ograds) {
+    // Gradient of the log backward node itself, for f(x) = log(x), f'(x) = 1/x and
+    // f''(x) = -1 * (f'(x) * f'(x)). Here inputs[0] is dL/dy, inputs[1] is x, and
+    // ograds[0] is the gradient arriving at this node's output (dL/dxgrad).
+    // Entry 0 of the result: ograds[0] * f'(x).
+    // Entry 1 of the result: ograds[0] * (dL/dy * f''(x)).
+    const auto& prefix = n->attrs.name;
+    const nnvm::NodeEntry first_order{n};  // this node's output: f'(x) * head_grads
+    const auto inv_x = MakeNode("reciprocal", prefix + "_dlogx",
+                                {n->inputs[1]}, nullptr, &n);
+    const auto scaled = MakeNode("elemwise_mul", prefix + "_d2ydx2_mid",
+                                 {first_order, nnvm::NodeEntry{inv_x}}, nullptr, &n);
+    const auto second_order = MakeNode("negative", prefix + "_d2ydx2",
+                                       {nnvm::NodeEntry{scaled}}, nullptr, &n);
+
+    std::vector<nnvm::NodeEntry> grads;
+    grads.emplace_back(MakeNode("elemwise_mul", prefix + "_backward_grad_grad",
+                                {ograds[0], nnvm::NodeEntry{inv_x}}, nullptr, &n));
+    grads.emplace_back(MakeNode("elemwise_mul", prefix + "_backward_grad_grad_inp",
+                                {ograds[0], nnvm::NodeEntry{second_order}}, nullptr, &n));
+    return grads;
+  });
+
+MXNET_OPERATOR_REGISTER_BINARY_WITH_SPARSE_CPU_DR(_backward_log10,
+                                                  unary_bwd<mshadow_op::log10_grad>)
+.set_attr<nnvm::FGradient>("FGradient",
+  [](const nnvm::NodePtr& n, const std::vector<nnvm::NodeEntry>& ograds) {
+    // Gradient of the log10 backward node (second-order gradient of log10).
+    // ograds[0]: dL/dxgrad; inputs[0]: dL/dy; inputs[1]: x
+    // f(x) = y = log10(x)
+    // f'(x) = 1 / (log(10) * x)
+    // f''(x) = -1 * (f'(x) * 1/x)
+    // Returns {ograds[0] * f'(x), ograds[0] * dL/dy * f''(x)}, one entry per input.
+    auto dydx_mul_dldy = nnvm::NodeEntry{n};  // f'(x) * head_grads
+    // f'(x) is recovered as (f'(x) * dL/dy) / dL/dy, so elemwise_div needs both operands.
+    auto dydx = MakeNode("elemwise_div", n->attrs.name + "_dydx",
+                            {dydx_mul_dldy, n->inputs[0]}, nullptr, &n);
+    auto dlogx = MakeNode("reciprocal", n->attrs.name + "_dlogx",
+                            {n->inputs[1]}, nullptr, &n);
+    auto d2ydx2_mid = MakeNode("elemwise_mul", n->attrs.name + "_d2ydx2_mid",
+                            {dydx_mul_dldy, nnvm::NodeEntry{dlogx}}, nullptr, &n);
+    auto d2ydx2 = MakeNode("negative", n->attrs.name + "_d2ydx2",
+                        {nnvm::NodeEntry{d2ydx2_mid}}, nullptr, &n);
+
+    std::vector<nnvm::NodeEntry> ret;
+    ret.emplace_back(MakeNode("elemwise_mul", n->attrs.name + "_backward_grad_grad",
+                             {ograds[0], nnvm::NodeEntry{dydx}}, nullptr, &n));
+    ret.emplace_back(MakeNode("elemwise_mul", n->attrs.name + "_backward_grad_grad_inp",
+                             {ograds[0], nnvm::NodeEntry{d2ydx2}}, nullptr, &n));
+    return ret;
+  });
+
+MXNET_OPERATOR_REGISTER_BINARY_WITH_SPARSE_CPU_DR(_backward_log2,
+                                                  unary_bwd<mshadow_op::log2_grad>)
+.set_attr<nnvm::FGradient>("FGradient",
+  [](const nnvm::NodePtr& n, const std::vector<nnvm::NodeEntry>& ograds) {
+    // Gradient of the log2 backward node (second-order gradient of log2).
+    // ograds[0]: dL/dxgrad; inputs[0]: dL/dy; inputs[1]: x
+    // f(x) = y = log2(x)
+    // f'(x) = 1 / (log(2) * x)
+    // f''(x) = -1 * (f'(x) * 1/x)
+    // Returns {ograds[0] * f'(x), ograds[0] * dL/dy * f''(x)}, one entry per input.
+    auto dydx_mul_dldy = nnvm::NodeEntry{n};  // f'(x) * head_grads
+    // f'(x) is recovered as (f'(x) * dL/dy) / dL/dy, so elemwise_div needs both operands.
+    auto dydx = MakeNode("elemwise_div", n->attrs.name + "_dydx",
+                            {dydx_mul_dldy, n->inputs[0]}, nullptr, &n);
+    auto dlogx = MakeNode("reciprocal", n->attrs.name + "_dlogx",
+                            {n->inputs[1]}, nullptr, &n);
+    auto d2ydx2_mid = MakeNode("elemwise_mul", n->attrs.name + "_d2ydx2_mid",
+                            {dydx_mul_dldy, nnvm::NodeEntry{dlogx}}, nullptr, &n);
+    auto d2ydx2 = MakeNode("negative", n->attrs.name + "_d2ydx2",
+                        {nnvm::NodeEntry{d2ydx2_mid}}, nullptr, &n);
+
+    std::vector<nnvm::NodeEntry> ret;
+    ret.emplace_back(MakeNode("elemwise_mul", n->attrs.name + "_backward_grad_grad",
+                             {ograds[0], nnvm::NodeEntry{dydx}}, nullptr, &n));
+    ret.emplace_back(MakeNode("elemwise_mul", n->attrs.name + "_backward_grad_grad_inp",
+                             {ograds[0], nnvm::NodeEntry{d2ydx2}}, nullptr, &n));
+    return ret;
+  });
+
+// log1p: log(1 + x) on CPU (mshadow_op::log1p); gradient operator is `_backward_log1p`.
+MXNET_OPERATOR_REGISTER_UNARY_WITH_RSP_CSR(log1p, cpu, mshadow_op::log1p)
+.describe(R"code(Returns element-wise ``log(1 + x)`` value of the input.
+
+This function is more accurate than ``log(1 + x)``  for small ``x`` so that
+:math:`1+x\approx 1`
+
+The storage type of ``log1p`` output depends upon the input storage type:
+
+   - log1p(default) = default
+   - log1p(row_sparse) = row_sparse
+   - log1p(csr) = csr
+
+)code" ADD_FILELINE)
+.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseIn{"_backward_log1p"});
+// Gradient operator named by the FGradient entry above, built on mshadow_op::log1p_grad.
+MXNET_OPERATOR_REGISTER_BINARY_WITH_SPARSE_CPU_DR(_backward_log1p,
+                                                  unary_bwd<mshadow_op::log1p_grad>);
+
+// expm1: exp(x) - 1 on CPU (mshadow_op::expm1); gradient operator is `_backward_expm1`.
+MXNET_OPERATOR_REGISTER_UNARY_WITH_RSP_CSR(expm1, cpu, mshadow_op::expm1)
+.describe(R"code(Returns ``exp(x) - 1`` computed element-wise on the input.
+
+This function provides greater precision than ``exp(x) - 1`` for small values of ``x``.
+
+The storage type of ``expm1`` output depends upon the input storage type:
+
+   - expm1(default) = default
+   - expm1(row_sparse) = row_sparse
+   - expm1(csr) = csr
+
+)code" ADD_FILELINE)
+.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseIn{"_backward_expm1"});
+// The derivative of exp(x) - 1 is exp(x), so the backward wraps mshadow_op::exp in unary_bwd.
+MXNET_OPERATOR_REGISTER_BINARY_WITH_SPARSE_CPU_DR(_backward_expm1, unary_bwd<mshadow_op::exp>);
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/tensor/elemwise_unary_op_logexp.cu b/src/operator/tensor/elemwise_unary_op_logexp.cu
new file mode 100644
index 000000000000..febc1914feb7
--- /dev/null
+++ b/src/operator/tensor/elemwise_unary_op_logexp.cu
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file elemwise_unary_op_logexp.cu
+ * \brief GPU Implementation of unary log and exp functions.
+ */
+#include "./elemwise_binary_op.h"
+
+namespace mxnet {
+namespace op {
+
+// exp: attaches the GPU FCompute kernel (mshadow_op::exp) to the `exp` operator.
+NNVM_REGISTER_OP(exp)
+.set_attr<FCompute>("FCompute<gpu>", UnaryOp::Compute<gpu, mshadow_op::exp>);
+
+// log: GPU FCompute only; no FComputeEx<gpu> is attached for log/log10/log2 in this file.
+NNVM_REGISTER_OP(log)
+.set_attr<FCompute>("FCompute<gpu>", UnaryOp::Compute<gpu, mshadow_op::log>);
+
+// log10: same registration pattern as log, using mshadow_op::log10.
+NNVM_REGISTER_OP(log10)
+.set_attr<FCompute>("FCompute<gpu>", UnaryOp::Compute<gpu, mshadow_op::log10>);
+
+// log2: same registration pattern as log, using mshadow_op::log2.
+NNVM_REGISTER_OP(log2)
+.set_attr<FCompute>("FCompute<gpu>", UnaryOp::Compute<gpu, mshadow_op::log2>);
+// Backward ops of log, log10 and log2: each wraps its *_grad functor in unary_bwd.
+NNVM_REGISTER_OP(_backward_log)
+.set_attr<FCompute>("FCompute<gpu>", ElemwiseBinaryOp::Compute<
+  gpu, unary_bwd<mshadow_op::log_grad> >);
+
+NNVM_REGISTER_OP(_backward_log10)
+.set_attr<FCompute>("FCompute<gpu>", ElemwiseBinaryOp::Compute<
+  gpu, unary_bwd<mshadow_op::log10_grad> >);
+
+NNVM_REGISTER_OP(_backward_log2)
+.set_attr<FCompute>("FCompute<gpu>", ElemwiseBinaryOp::Compute<
+  gpu, unary_bwd<mshadow_op::log2_grad> >);
+
+// log1p: GPU kernels for both the FCompute and the FComputeEx attribute.
+NNVM_REGISTER_OP(log1p)
+.set_attr<FCompute>("FCompute<gpu>", UnaryOp::Compute<gpu, mshadow_op::log1p>)
+.set_attr<FComputeEx>("FComputeEx<gpu>", UnaryOp::ComputeEx<gpu, mshadow_op::log1p>);
+// Backward of log1p: only FCompute<gpu> is attached here (functor: mshadow_op::log1p_grad).
+NNVM_REGISTER_OP(_backward_log1p)
+.set_attr<FCompute>("FCompute<gpu>", ElemwiseBinaryOp::Compute<
+  gpu, unary_bwd<mshadow_op::log1p_grad> >);
+
+// expm1: GPU kernels for both the FCompute and the FComputeEx attribute.
+NNVM_REGISTER_OP(expm1)
+.set_attr<FCompute>("FCompute<gpu>", UnaryOp::Compute<gpu, mshadow_op::expm1>)
+.set_attr<FComputeEx>("FComputeEx<gpu>", UnaryOp::ComputeEx<gpu, mshadow_op::expm1>);
+// Backward of expm1 reuses mshadow_op::exp, since d/dx (exp(x) - 1) = exp(x).
+NNVM_REGISTER_OP(_backward_expm1)
+.set_attr<FCompute>("FCompute<gpu>", ElemwiseBinaryOp::Compute<
+  gpu, unary_bwd<mshadow_op::exp> >);
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/tensor/elemwise_unary_op_pow.cc b/src/operator/tensor/elemwise_unary_op_pow.cc
new file mode 100644
index 000000000000..f22dabc7201a
--- /dev/null
+++ b/src/operator/tensor/elemwise_unary_op_pow.cc
@@ -0,0 +1,212 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file elemwise_unary_op_pow.cc
+ * \brief CPU Implementation of elementwise power (x^k for fixed k) function.
+ */
+#include <mxnet/base.h>
+#include "elemwise_unary_op.h"
+#include "./elemwise_binary_op-inl.h"
+#include "../nn/mkldnn/mkldnn_ops-inl.h"
+
+namespace mxnet {
+namespace op {
+
+// reciprocal: element-wise 1/x on CPU; its gradient is delegated to _backward_reciprocal.
+MXNET_OPERATOR_REGISTER_UNARY(reciprocal)
+.describe(R"code(Returns the reciprocal of the argument, element-wise.
+
+Calculates 1/x.
+
+Example::
+
+    reciprocal([-2, 1, 3, 1.6, 0.2]) = [-0.5, 1.0, 0.33333334, 0.625, 5.0]
+
+)code" ADD_FILELINE)
+.set_attr<FCompute>("FCompute<cpu>", UnaryOp::Compute<cpu, mshadow_op::reciprocal>)
+.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseIn{"_backward_reciprocal"});
+
+MXNET_OPERATOR_REGISTER_BINARY(_backward_reciprocal)
+.set_attr<FCompute>("FCompute<cpu>",
+  ElemwiseBinaryOp::Compute<cpu, unary_bwd<mshadow_op::reciprocal_grad> >)
+.set_attr<nnvm::FGradient>("FGradient",
+  [](const nnvm::NodePtr& n, const std::vector<nnvm::NodeEntry>& ograds) {
+    // Second-order gradient for y = f(x) = 1/x, using
+    //   f'(x) = -1/x^2  and  f''(x) = 2/x^3 = -2 * (f'(x) * f(x)).
+    // n->inputs[0] is dL/dy, n->inputs[1] is x, this node's output is
+    // f'(x) * dL/dy, and ograds[0] is the gradient w.r.t. that output.
+    const std::string& base_name = n->attrs.name;
+    const std::unordered_map<std::string, std::string> scalar_attr = {{"scalar", "-2.0"}};
+
+    const auto node_out = nnvm::NodeEntry{n};  // f'(x) * dL/dy
+    // Divide the head gradient back out to recover f'(x).
+    const auto first_deriv = MakeNode("elemwise_div", base_name + "_dydx",
+                                      {node_out, n->inputs[0]}, nullptr, &n);
+    const auto recip_x = MakeNode("reciprocal", base_name + "_fx",
+                                  {n->inputs[1]}, nullptr, &n);
+    // f'(x) * dL/dy * f(x); scaling by -2 below gives f''(x) * dL/dy.
+    const auto deriv_times_fx = MakeNode("elemwise_mul", base_name + "_d2ydx2_mid",
+                                         {node_out, nnvm::NodeEntry{recip_x}}, nullptr, &n);
+    const auto second_deriv = MakeNode("_mul_scalar", base_name + "_d2ydx2",
+                                       {nnvm::NodeEntry{deriv_times_fx}}, &scalar_attr, &n);
+
+    // Entry 0: gradient for inputs[0] (dL/dy); entry 1: gradient for inputs[1] (x).
+    std::vector<nnvm::NodeEntry> input_grads;
+    input_grads.emplace_back(
+        MakeNode("elemwise_mul", base_name + "_backward_grad_grad",
+                 {ograds[0], nnvm::NodeEntry{first_deriv}}, nullptr, &n));
+    input_grads.emplace_back(
+        MakeNode("elemwise_mul", base_name + "_backward_grad_grad_inp",
+                 {ograds[0], nnvm::NodeEntry{second_deriv}}, nullptr, &n));
+    return input_grads;
+});
+
+// square: both #if branches register the same op; only the registration macro call differs.
+#if MSHADOW_USE_MKL == 1
+MXNET_MKL_OPERATOR_REGISTER_UNARY_WITH_RSP_CSR(square, cpu, mshadow_op::square, mkl_func::square)
+.describe(R"code(Returns element-wise squared value of the input.
+
+.. math::
+   square(x) = x^2
+
+Example::
+
+   square([2, 3, 4]) = [4, 9, 16]
+
+The storage type of ``square`` output depends upon the input storage type:
+
+   - square(default) = default
+   - square(row_sparse) = row_sparse
+   - square(csr) = csr
+
+)code" ADD_FILELINE)
+.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseIn{"_backward_square"});
+#else
+MXNET_OPERATOR_REGISTER_UNARY_WITH_RSP_CSR(square, cpu, mshadow_op::square)
+.describe(R"code(Returns element-wise squared value of the input.
+
+.. math::
+   square(x) = x^2
+
+Example::
+
+   square([2, 3, 4]) = [4, 9, 16]
+
+The storage type of ``square`` output depends upon the input storage type:
+
+   - square(default) = default
+   - square(row_sparse) = row_sparse
+   - square(csr) = csr
+
+)code" ADD_FILELINE)
+.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseIn{"_backward_square"});
+#endif
+// Backward of square, shared by both branches above (functor: mshadow_op::square_grad).
+MXNET_OPERATOR_REGISTER_BINARY_WITH_SPARSE_CPU(_backward_square,
+                                               unary_bwd<mshadow_op::square_grad>);
+
+// sqrt: element-wise square root; FGradient is ElemwiseGradUseOut (not ElemwiseGradUseIn).
+MXNET_OPERATOR_REGISTER_UNARY_WITH_RSP_CSR(sqrt, cpu, mshadow_op::square_root)
+.describe(R"code(Returns element-wise square-root value of the input.
+
+.. math::
+   \textrm{sqrt}(x) = \sqrt{x}
+
+Example::
+
+   sqrt([4, 9, 16]) = [2, 3, 4]
+
+The storage type of ``sqrt`` output depends upon the input storage type:
+
+   - sqrt(default) = default
+   - sqrt(row_sparse) = row_sparse
+   - sqrt(csr) = csr
+
+)code" ADD_FILELINE)
+.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseOut{"_backward_sqrt"});
+// Backward of sqrt (functor: mshadow_op::square_root_grad).
+MXNET_OPERATOR_REGISTER_BINARY_WITH_SPARSE_CPU_DR(_backward_sqrt,
+                                                  unary_bwd<mshadow_op::square_root_grad>);
+
+// rsqrt: element-wise 1/sqrt(x); per the op description below, the output is always dense.
+MXNET_OPERATOR_REGISTER_UNARY_WITH_SPARSE_DR(rsqrt, cpu, mshadow_op::reciprocal_square_root)
+MXNET_ADD_SPARSE_OP_ALIAS(rsqrt)
+.describe(R"code(Returns element-wise inverse square-root value of the input.
+
+.. math::
+   rsqrt(x) = 1/\sqrt{x}
+
+Example::
+
+   rsqrt([4,9,16]) = [0.5, 0.33333334, 0.25]
+
+The storage type of ``rsqrt`` output is always dense
+
+)code" ADD_FILELINE)
+.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseIn{"_backward_rsqrt"});
+// Backward of rsqrt (functor: mshadow_op::reciprocal_square_root_grad).
+MXNET_OPERATOR_REGISTER_BINARY_WITH_SPARSE_CPU_DR(
+  _backward_rsqrt, unary_bwd<mshadow_op::reciprocal_square_root_grad>);
+
+// cbrt: element-wise cube root; FGradient is ElemwiseGradUseOut (not ElemwiseGradUseIn).
+MXNET_OPERATOR_REGISTER_UNARY_WITH_RSP_CSR(cbrt, cpu, mshadow_op::cube_root)
+.describe(R"code(Returns element-wise cube-root value of the input.
+
+.. math::
+   cbrt(x) = \sqrt[3]{x}
+
+Example::
+
+   cbrt([1, 8, -125]) = [1, 2, -5]
+
+The storage type of ``cbrt`` output depends upon the input storage type:
+
+   - cbrt(default) = default
+   - cbrt(row_sparse) = row_sparse
+   - cbrt(csr) = csr
+
+)code" ADD_FILELINE)
+.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseOut{"_backward_cbrt"});
+// Backward of cbrt (functor: mshadow_op::cube_root_grad).
+MXNET_OPERATOR_REGISTER_BINARY_WITH_SPARSE_CPU_DR(_backward_cbrt,
+                                                  unary_bwd<mshadow_op::cube_root_grad>);
+
+// rcbrt: element-wise 1/cbrt(x); plain unary registration with an explicit CPU FCompute.
+MXNET_OPERATOR_REGISTER_UNARY(rcbrt)
+.describe(R"code(Returns element-wise inverse cube-root value of the input.
+
+.. math::
+   rcbrt(x) = 1/\sqrt[3]{x}
+
+Example::
+
+   rcbrt([1,8,-125]) = [1.0, 0.5, -0.2]
+
+)code" ADD_FILELINE)
+.set_attr<FCompute>("FCompute<cpu>", UnaryOp::Compute<cpu, mshadow_op::reciprocal_cube_root>)
+.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseIn{"_backward_rcbrt"});
+// Backward of rcbrt (functor: mshadow_op::reciprocal_cube_root_grad); CPU FCompute only.
+MXNET_OPERATOR_REGISTER_BINARY(_backward_rcbrt)
+.set_attr<FCompute>("FCompute<cpu>",
+                    ElemwiseBinaryOp::Compute<cpu,
+                      unary_bwd<mshadow_op::reciprocal_cube_root_grad>>);
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/tensor/elemwise_unary_op_pow.cu b/src/operator/tensor/elemwise_unary_op_pow.cu
new file mode 100644
index 000000000000..4dbdf349cdb0
--- /dev/null
+++ b/src/operator/tensor/elemwise_unary_op_pow.cu
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file elemwise_unary_op_pow.cu
+ * \brief GPU Implementation of power (x^k for fixed k) functions.
+ */
+#include "./elemwise_binary_op.h"
+
+namespace mxnet {
+namespace op {
+
+// reciprocal: attaches the GPU FCompute kernel (mshadow_op::reciprocal).
+NNVM_REGISTER_OP(reciprocal)
+.set_attr<FCompute>("FCompute<gpu>", UnaryOp::Compute<gpu, mshadow_op::reciprocal>);
+// Backward of reciprocal on GPU (functor: mshadow_op::reciprocal_grad).
+NNVM_REGISTER_OP(_backward_reciprocal)
+.set_attr<FCompute>("FCompute<gpu>", ElemwiseBinaryOp::Compute<
+  gpu, unary_bwd<mshadow_op::reciprocal_grad> >);
+
+// square: GPU kernels for both the FCompute and the FComputeEx attribute.
+NNVM_REGISTER_OP(square)
+.set_attr<FCompute>("FCompute<gpu>", UnaryOp::Compute<gpu, mshadow_op::square>)
+.set_attr<FComputeEx>("FComputeEx<gpu>", UnaryOp::ComputeEx<gpu, mshadow_op::square>);
+// Backward of square on GPU: only FCompute<gpu> is attached (functor: mshadow_op::square_grad).
+NNVM_REGISTER_OP(_backward_square)
+.set_attr<FCompute>("FCompute<gpu>", ElemwiseBinaryOp::Compute<
+  gpu, unary_bwd<mshadow_op::square_grad> >);
+
+// sqrt: GPU kernels for both the FCompute and the FComputeEx attribute.
+NNVM_REGISTER_OP(sqrt)
+.set_attr<FCompute>("FCompute<gpu>", UnaryOp::Compute<gpu, mshadow_op::square_root>)
+.set_attr<FComputeEx>("FComputeEx<gpu>", UnaryOp::ComputeEx<gpu, mshadow_op::square_root>);
+// Backward of sqrt on GPU: only FCompute<gpu> is attached (functor: square_root_grad).
+NNVM_REGISTER_OP(_backward_sqrt)
+.set_attr<FCompute>("FCompute<gpu>", ElemwiseBinaryOp::Compute<
+  gpu, unary_bwd<mshadow_op::square_root_grad> >);
+
+// rsqrt: FCompute<gpu> only (no FComputeEx<gpu>), unlike square/sqrt/cbrt in this file.
+NNVM_REGISTER_OP(rsqrt)
+.set_attr<FCompute>("FCompute<gpu>", UnaryOp::Compute<gpu, mshadow_op::reciprocal_square_root>);
+// Backward of rsqrt on GPU (functor: mshadow_op::reciprocal_square_root_grad).
+NNVM_REGISTER_OP(_backward_rsqrt)
+.set_attr<FCompute>("FCompute<gpu>",
+  ElemwiseBinaryOp::Compute<gpu, unary_bwd<mshadow_op::reciprocal_square_root_grad> >);
+
+// cbrt: GPU kernels for both the FCompute and the FComputeEx attribute.
+NNVM_REGISTER_OP(cbrt)
+.set_attr<FCompute>("FCompute<gpu>", UnaryOp::Compute<gpu, mshadow_op::cube_root>)
+.set_attr<FComputeEx>("FComputeEx<gpu>", UnaryOp::ComputeEx<gpu, mshadow_op::cube_root>);
+
+// Backward of cbrt on GPU: only FCompute<gpu> is attached (functor: cube_root_grad).
+NNVM_REGISTER_OP(_backward_cbrt)
+.set_attr<FCompute>("FCompute<gpu>", ElemwiseBinaryOp::Compute<
+  gpu, unary_bwd<mshadow_op::cube_root_grad> >);
+
+// rcbrt: attaches the GPU FCompute kernel (mshadow_op::reciprocal_cube_root).
+NNVM_REGISTER_OP(rcbrt)
+.set_attr<FCompute>("FCompute<gpu>", UnaryOp::Compute<gpu, mshadow_op::reciprocal_cube_root>);
+// Backward of rcbrt on GPU (functor: mshadow_op::reciprocal_cube_root_grad).
+NNVM_REGISTER_OP(_backward_rcbrt)
+.set_attr<FCompute>("FCompute<gpu>",
+  ElemwiseBinaryOp::Compute<gpu, unary_bwd<mshadow_op::reciprocal_cube_root_grad> >);
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/tests/python/unittest/test_random.py b/tests/python/unittest/test_random.py
index 8fdca619a5c8..720c25d2711e 100644
--- a/tests/python/unittest/test_random.py
+++ b/tests/python/unittest/test_random.py
@@ -893,14 +893,15 @@ def compute_expected_prob():
 def test_shuffle():
     def check_first_axis_shuffle(arr):
         stride = int(arr.size / arr.shape[0])
-        column0 = arr.reshape((arr.size,))[::stride].sort()
+        column0 = arr.reshape((arr.size,))[::stride]
         seq = mx.nd.arange(0, arr.size - stride + 1, stride, ctx=arr.context)
-        assert (column0 == seq).prod() == 1
-        for i in range(arr.shape[0]):
-            subarr = arr[i].reshape((arr[i].size,))
-            start = subarr[0].asscalar()
-            seq = mx.nd.arange(start, start + stride, ctx=arr.context)
-            assert (subarr == seq).prod() == 1
+        assert (column0.sort() == seq).prod() == 1
+        # Check for ascending flattened-row sequences for 2D or greater inputs.
+        if stride > 1:
+            ascending_seq = mx.nd.arange(0, stride, ctx=arr.context)
+            equalized_columns = arr.reshape((arr.shape[0], stride)) - ascending_seq
+            column0_2d = column0.reshape((arr.shape[0],1))
+            assert (column0_2d == equalized_columns).prod() == 1
 
     # This tests that the shuffling is along the first axis with `repeat1` number of shufflings
     # and the outcomes are uniformly distributed with `repeat2` number of shufflings.