diff --git a/python/mxnet/ndarray/numpy/_op.py b/python/mxnet/ndarray/numpy/_op.py
index ac97c20a8a5a..d00fbec9ffe9 100644
--- a/python/mxnet/ndarray/numpy/_op.py
+++ b/python/mxnet/ndarray/numpy/_op.py
@@ -3605,8 +3605,8 @@ def median(a, axis=None, out=None, keepdims=False):
     -------
     median : ndarray
         A new array holding the result. If the input contains integers
-        or floats smaller than ``float64``, then the output data-type is
-        ``np.float64``. Otherwise, the data-type of the output is the
+        or floats smaller than ``float32``, then the output data-type is
+        ``np.float32``. Otherwise, the data-type of the output is the
         same as that of the input. If `out` is specified, that array is
         returned instead.

diff --git a/python/mxnet/numpy/multiarray.py b/python/mxnet/numpy/multiarray.py
index d66650884066..9a3d5d21915b 100644
--- a/python/mxnet/numpy/multiarray.py
+++ b/python/mxnet/numpy/multiarray.py
@@ -5368,8 +5368,8 @@ def median(a, axis=None, out=None, keepdims=False):
     -------
     median : ndarray
         A new array holding the result. If the input contains integers
-        or floats smaller than ``float64``, then the output data-type is
-        ``np.float64``. Otherwise, the data-type of the output is the
+        or floats smaller than ``float32``, then the output data-type is
+        ``np.float32``. Otherwise, the data-type of the output is the
         same as that of the input. If `out` is specified, that array is
         returned instead.

diff --git a/python/mxnet/symbol/numpy/_symbol.py b/python/mxnet/symbol/numpy/_symbol.py
index c15760b4dc58..3585c150ae54 100644
--- a/python/mxnet/symbol/numpy/_symbol.py
+++ b/python/mxnet/symbol/numpy/_symbol.py
@@ -3457,8 +3457,8 @@ def median(a, axis=None, out=None, keepdims=False):
     -------
     median : _Symbol
         A new array holding the result. If the input contains integers
-        or floats smaller than ``float64``, then the output data-type is
-        ``np.float64``. Otherwise, the data-type of the output is the
+        or floats smaller than ``float32``, then the output data-type is
+        ``np.float32``. Otherwise, the data-type of the output is the
         same as that of the input. If `out` is specified, that array is
         returned instead.
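Reviewer note on the three docstring hunks above: the documented dtype contract changes from float64-by-default to float32-by-default, matching the operator-level type inference updated at the bottom of this patch. A minimal sanity check of the documented behavior, assuming the `mxnet.numpy` front end (illustrative snippet, not part of the patch):

```python
from mxnet import np, npx
npx.set_np()  # enable NumPy-compatible array semantics

# Integer input: per the revised docstring, the result dtype is float32.
a = np.array([[1, 2, 3], [4, 5, 6]], dtype='int32')
print(np.median(a))        # expected: 3.5
print(np.median(a).dtype)  # expected: float32
```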
diff --git a/src/imperative/imperative_utils.h b/src/imperative/imperative_utils.h
index 702b6504a7fc..edc2f2adf4c2 100644
--- a/src/imperative/imperative_utils.h
+++ b/src/imperative/imperative_utils.h
@@ -274,7 +274,7 @@ inline void SetDependency(const nnvm::NodeAttrs& attrs,
        LOG(FATAL) << "resource type not yet supported";
      }
    }
-  CHECK_LE(ntmp, 2) << "Only support 1 temp space request";
+  CHECK_LE(ntmp, 1) << "Only support 1 temp space request";
 }

 // append extra resource requests for storage fallback
diff --git a/src/operator/numpy/np_broadcast_reduce_op.h b/src/operator/numpy/np_broadcast_reduce_op.h
index 063fe17c1c7c..ac1d1611dcff 100644
--- a/src/operator/numpy/np_broadcast_reduce_op.h
+++ b/src/operator/numpy/np_broadcast_reduce_op.h
@@ -25,30 +25,16 @@
 #ifndef MXNET_OPERATOR_NUMPY_NP_BROADCAST_REDUCE_OP_H_
 #define MXNET_OPERATOR_NUMPY_NP_BROADCAST_REDUCE_OP_H_

+#include
 #include
 #include
 #include
-#include
 #include "../nn/moments-inl.h"
 #include "../tensor/broadcast_reduce_op.h"
 #include "../tensor/elemwise_binary_broadcast_op.h"
 #include "../tensor/ordering_op-inl.h"
 #include "../tensor/matrix_op-inl.h"
-#include
-#include
-#include
-#include
-#include
-#include "../mshadow_op.h"
-#include "../elemwise_op_common.h"
-#include "../tensor/sort_op.h"
-#include "../tensor/indexing_op.h"
-#include "/home/ubuntu/incubator-mxnet/3rdparty/mshadow/mshadow/extension/transpose.h"
-#include "/home/ubuntu/incubator-mxnet/3rdparty/mshadow/mshadow/extension/reshape.h"
-
-
 namespace mxnet {
 namespace op {
@@ -994,7 +980,6 @@ void NumpyMedianForward(const nnvm::NodeAttrs& attrs,
   }

   MSHADOW_TYPE_SWITCH(a.type_flag_, DType, {
-    using namespace mshadow;
     using namespace mshadow::expr;
     Tensor<xpu, 1, char> workspace;
     Tensor<xpu, 1, char> temp_workspace;
@@ -1037,12 +1022,10 @@ void NumpyMedianForward(const nnvm::NodeAttrs& attrs,
       trans_ptr = reinterpret_cast<DType*>(temp_mem.dptr_);
       sort_ptr = reinterpret_cast<DType*>(temp_mem.dptr_ + temp_data_size);
       idx_ptr = reinterpret_cast<index_t*>(temp_mem.dptr_ + 2 * temp_data_size);
-      //workspace_curr_ptr = reinterpret_cast<char*>(temp_mem.dptr_ + temp_mem_size);
     } else {
       idx_ptr = reinterpret_cast<index_t*>(temp_mem.dptr_);
       trans_ptr = reinterpret_cast<DType*>(temp_mem.dptr_ + idx_size);
       sort_ptr = reinterpret_cast<DType*>(temp_mem.dptr_ + temp_data_size + idx_size);
-      //workspace_curr_ptr = reinterpret_cast<char*>(temp_mem.dptr_ + temp_mem_size);
     }

     workspace_curr_ptr = temp_mem.dptr_ + 2 * temp_data_size + idx_size;
@@ -1059,18 +1042,14 @@ void NumpyMedianForward(const nnvm::NodeAttrs& attrs,
     TBlob a_sort = TBlob(sort_ptr, t_shape, xpu::kDevMask);
     TBlob a_idx = TBlob(idx_ptr, t_shape, xpu::kDevMask);

-    /*TopKImpl(ctx.run_ctx,
-               ctx.requested[1], {kWriteTo, kNullOp}, a_trans.reshape(t_shape),
-               {a_sort, a_idx},
-               topk_param);*/
-
     // input
     std::vector<OpReqType> req_TopK = {kWriteTo, kNullOp};
     TBlob src = a_trans.reshape(t_shape);
     std::vector<TBlob> ret = {a_sort, a_idx};
     TopKParam parameter = topk_param;
     ParseTopKParam(src.shape_, parameter,
-                   &target_shape, &batch_size, &element_num, &axis_topk, &k, &do_transpose, &is_ascend);
+                   &target_shape, &batch_size, &element_num, &axis_topk,
+                   &k, &do_transpose, &is_ascend);
     CHECK_LE(element_num, mxnet::common::MaxIntegerValue<index_t>())
       << "'index_t' does not have a sufficient precision to represent "
       << "the indices of the input array. The total element_num is "
@@ -1079,61 +1058,53 @@ void NumpyMedianForward(const nnvm::NodeAttrs& attrs,
     Tensor<xpu, 3, DType> dat = src.FlatTo3D<xpu, DType>(axis_topk, axis_topk, s);
     sorted_dat = Tensor<xpu, 1, DType>(reinterpret_cast<DType*>(workspace_curr_ptr),
-                                         Shape1(src.Size()), s);  // contain sorted dat
-      workspace_curr_ptr += PadBytes(sizeof(DType) * src.Size(), alignment);
-      indices = Tensor<xpu, 1, index_t>(reinterpret_cast<index_t*>(workspace_curr_ptr),
-                                        Shape1(src.Size()), s);  // indices in the original matrix
-      workspace_curr_ptr += PadBytes(sizeof(index_t) * src.Size(), alignment);
-
-      if (parameter.ret_typ == topk_enum::kReturnMask) {
-        sel_indices = Tensor<xpu, 1, index_t>(reinterpret_cast<index_t*>(workspace_curr_ptr),
-                                              Shape1(batch_size * k), s);
-        workspace_curr_ptr += PadBytes(sizeof(index_t) * batch_size * k, alignment);
-        CHECK_EQ(sel_indices.CheckContiguous(), true);
-      }
+                                       Shape1(src.Size()), s);  // contain sorted dat
+    workspace_curr_ptr += PadBytes(sizeof(DType) * src.Size(), alignment);
+    indices = Tensor<xpu, 1, index_t>(reinterpret_cast<index_t*>(workspace_curr_ptr),
+                                      Shape1(src.Size()), s);  // indices in the original matrix
+    workspace_curr_ptr += PadBytes(sizeof(index_t) * src.Size(), alignment);

-      if (std::is_same<xpu, cpu>::value) {
-        Tensor<xpu, 1, DType> flattened_data;
-        if (do_transpose) {
-          flattened_data = Tensor<xpu, 1, DType>(reinterpret_cast<DType*>(workspace_curr_ptr),
-                                                 Shape1(src.Size()), s);
-          workspace_curr_ptr += sizeof(DType) * src.Size();
-          flattened_data = reshape(transpose(dat, Shape3(0, 2, 1)), Shape1(src.Size()));
-          CHECK_EQ(flattened_data.CheckContiguous(), true);
-        } else {
-          flattened_data = src.FlatTo1D<xpu, DType>(s);
-        }
-        // `temp_workspace` stores the flattened data
-        temp_workspace = Tensor<xpu, 1, char>(reinterpret_cast<char*>(flattened_data.dptr_),
-                                              Shape1(sizeof(DType)*src.Size()), s);
-        CHECK_EQ(temp_workspace.CheckContiguous(), true);
-      } else {
-        if (do_transpose) {
-          sorted_dat = reshape(transpose(dat, Shape3(0, 2, 1)), Shape1(src.Size()));
+    if (parameter.ret_typ == topk_enum::kReturnMask) {
+      sel_indices = Tensor<xpu, 1, index_t>(reinterpret_cast<index_t*>(workspace_curr_ptr),
+                                            Shape1(batch_size * k), s);
+      workspace_curr_ptr += PadBytes(sizeof(index_t) * batch_size * k, alignment);
+      CHECK_EQ(sel_indices.CheckContiguous(), true);
+    }
+
+    if (std::is_same<xpu, cpu>::value) {
+      Tensor<xpu, 1, DType> flattened_data;
+      if (do_transpose) {
+        flattened_data = Tensor<xpu, 1, DType>(reinterpret_cast<DType*>(workspace_curr_ptr),
+                                               Shape1(src.Size()), s);
+        workspace_curr_ptr += sizeof(DType) * src.Size();
+        flattened_data = reshape(transpose(dat, Shape3(0, 2, 1)), Shape1(src.Size()));
+        CHECK_EQ(flattened_data.CheckContiguous(), true);
+      } else {
+        flattened_data = src.FlatTo1D<xpu, DType>(s);
+      }
+      // `temp_workspace` stores the flattened data
+      temp_workspace = Tensor<xpu, 1, char>(reinterpret_cast<char*>(flattened_data.dptr_),
+                                            Shape1(sizeof(DType)*src.Size()), s);
+      CHECK_EQ(temp_workspace.CheckContiguous(), true);
     } else {
-        sorted_dat = reshape(dat, Shape1(src.Size()));
+      if (do_transpose) {
+        sorted_dat = reshape(transpose(dat, Shape3(0, 2, 1)), Shape1(src.Size()));
+      } else {
+        sorted_dat = reshape(dat, Shape1(src.Size()));
+      }
+      CHECK_EQ(sorted_dat.CheckContiguous(), true);
+      temp_workspace = Tensor<xpu, 1, char>(workspace_curr_ptr, Shape1(temp_size), s);
+      workspace_curr_ptr += temp_size;
     }
-      CHECK_EQ(sorted_dat.CheckContiguous(), true);
-      temp_workspace = Tensor<xpu, 1, char>(workspace_curr_ptr, Shape1(temp_size), s);  // temp space
-      workspace_curr_ptr += temp_size;
-    }

     mxnet_op::Kernel<range_fwd, xpu>::Launch(s, batch_size * element_num, 1, index_t{0}, index_t{1}, kWriteTo, indices.dptr_);
     CHECK_EQ(indices.CheckContiguous(), true);

     // 2. Perform inplace batch sort.
-    // After sorting, each batch in `sorted_dat` will be sorted in the corresponding order
-    //  up to the k-th element and the `indices` will contain the corresponding index in `sorted_dat`
-    // `temp_workspace` is used to store the flattend source data for CPU device, and it's used as
-    //  a temporal buffer for GPU device.
     TopKSort(sorted_dat, indices, temp_workspace, k, element_num, is_ascend, s);

     // 3. Assign results to the ret blob
-    // When returning indices, only update(modulo) required elements instead of full elements
-    // to avoid redundant calculation.
-    // Cast `ret_indices` from int to real_t could introduce conversion error when the element_num
-    // is large enough.
     if (parameter.ret_typ == topk_enum::kReturnMask) {
       Tensor<xpu, 1, DType> ret_mask = ret[0].FlatTo1D<xpu, DType>(s);
       ret_mask = scalar<DType>(0);
@@ -1145,8 +1116,8 @@ void NumpyMedianForward(const nnvm::NodeAttrs& attrs,
       if (do_transpose) {
         mxnet::TShape src_shape = src.shape_.FlatTo3D(axis_topk);
         CHECK_EQ(sel_indices.CheckContiguous(), true);
-        sel_indices = transpose_indices(sel_indices, Shape3(src_shape[0], src_shape[2], src_shape[1]),
-                                        Shape3(0, 2, 1));
+        sel_indices = transpose_indices(sel_indices, Shape3(src_shape[0], src_shape[2],
+                                        src_shape[1]), Shape3(0, 2, 1));
       }
       if (req_TopK[0] == kNullOp) {
         return;
@@ -1158,7 +1129,8 @@ void NumpyMedianForward(const nnvm::NodeAttrs& attrs,
       }
     } else if (parameter.ret_typ == topk_enum::kReturnIndices) {
       if (do_transpose) {
-        Tensor<xpu, 3, index_t> ret_indices = ret[0].FlatTo3D<xpu, index_t>(axis_topk, axis_topk, s);
+        Tensor<xpu, 3, index_t> ret_indices = ret[0].FlatTo3D<xpu, index_t>(axis_topk,
+                                                                            axis_topk, s);
         ASSIGN_DISPATCH(ret_indices, req_TopK[0], tcast<index_t>(F<mshadow_op::mod>(transpose(
                         slice<2>(inplace_reshape(indices,
                                                  Shape3(ret_indices.shape_[0],
@@ -1176,11 +1148,12 @@ void NumpyMedianForward(const nnvm::NodeAttrs& attrs,
     } else {
       if (do_transpose) {
         Tensor<xpu, 3, DType> ret_value = ret[0].FlatTo3D<xpu, DType>(axis_topk, axis_topk, s);
-        Tensor<xpu, 3, index_t> ret_indices = ret[1].FlatTo3D<xpu, index_t>(axis_topk, axis_topk, s);
+        Tensor<xpu, 3, index_t> ret_indices = ret[1].FlatTo3D<xpu, index_t>(axis_topk,
+                                                                            axis_topk, s);
         ASSIGN_DISPATCH(ret_value, req_TopK[0], transpose(
                         slice<2>(inplace_reshape(sorted_dat,
-                                                 Shape3(ret_value.shape_[0], ret_value.shape_[2], element_num)),
-                                 0, k), Shape3(0, 2, 1)));
+                                                 Shape3(ret_value.shape_[0], ret_value.shape_[2],
+                                                        element_num)), 0, k), Shape3(0, 2, 1)));
         ASSIGN_DISPATCH(ret_indices, req_TopK[1], tcast<index_t>(F<mshadow_op::mod>(transpose(
                         slice<2>(inplace_reshape(indices,
                                                  Shape3(ret_indices.shape_[0],
diff --git a/src/operator/numpy/np_broadcast_reduce_op_value.cc b/src/operator/numpy/np_broadcast_reduce_op_value.cc
index 9eab4966eb97..de17858f3d87 100644
--- a/src/operator/numpy/np_broadcast_reduce_op_value.cc
+++ b/src/operator/numpy/np_broadcast_reduce_op_value.cc
@@ -327,8 +327,7 @@ inline bool NumpyMedianType(const nnvm::NodeAttrs& attrs,
   CHECK_EQ(in_attrs->size(), 1U);
   CHECK_EQ(out_attrs->size(), 1U);

-  TYPE_ASSIGN_CHECK(*in_attrs, 0, out_attrs->at(0));
-  TYPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(0));
+  TYPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::kFloat32);

   return out_attrs->at(0) != -1 && in_attrs->at(0) != -1;
 }
@@ -347,7 +346,7 @@ NNVM_REGISTER_OP(_npi_median)
 .set_attr<FCompute>("FCompute", NumpyMedianForward<cpu>)
 .set_attr<FResourceRequest>("FResourceRequest",
   [](const NodeAttrs& attrs) {
-    return std::vector<ResourceRequest>{ResourceRequest::kTempSpace, ResourceRequest::kTempSpace};
+    return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
   })
 // .set_attr<THasDeterministicOutput>("THasDeterministicOutput", true)
 .set_attr<nnvm::FGradient>("FGradient", MakeZeroGradNodes);
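Reviewer note: the C++ hunks above work as a unit. `NumpyMedianType` now pins the output dtype to `mshadow::kFloat32` instead of mirroring the input type, and `_npi_median` requests a single `kTempSpace` resource, with the transposed input, sorted values, indices, and sort scratch all carved out of that one allocation through `workspace_curr_ptr` (hence the tightened `CHECK_LE(ntmp, 1)` in `imperative_utils.h`). A hedged end-to-end check of the resulting behavior, again assuming the `mxnet.numpy` front end (illustrative, not part of the patch):

```python
from mxnet import np, npx
npx.set_np()

x = np.array([[3, 1, 2], [6, 4, 5]], dtype='int32')

# axis=1 exercises the transpose + batched-sort path rewritten above.
m = np.median(x, axis=1)
print(m)        # expected: [2. 5.]
print(m.dtype)  # expected: float32, per the new NumpyMedianType

# keepdims retains the reduced axis with length 1.
print(np.median(x, axis=1, keepdims=True).shape)  # expected: (2, 1)
```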