[MXNET-1413] Adding Large Tensor support for sort operators
Rohit Kumar Srivastava committed Jun 6, 2019
1 parent 3f4f3d5 commit ccb8d6c
Showing 4 changed files with 106 additions and 76 deletions.
6 changes: 6 additions & 0 deletions python/mxnet/test_utils.py
@@ -425,6 +425,12 @@ def rand_coord_2d(x_low, x_high, y_low, y_high):
return x, y


+def create_2d_tensor(rows, columns):
+    a = np.arange(0, rows).reshape(rows, 1)
+    b = np.broadcast_to(a, shape=(a.shape[0], columns))
+    return mx.nd.array(b, dtype=np.int64)


def np_reduce(dat, axis, keepdims, numpy_reduce_func):
"""Compatible reduce for old version of NumPy.
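Note: the new `create_2d_tensor` helper builds a rows × columns matrix in which every element of row i equals i, giving sort and argsort on large shapes a result that is trivial to verify. A hypothetical C++ analogue of the same construction (illustration only, not part of this commit):

```cpp
// Hypothetical analogue of create_2d_tensor: every element of row i holds
// the value i, so sort/argsort output along either axis is predictable.
#include <cstdint>
#include <vector>

std::vector<int64_t> make_2d_rows(int64_t rows, int64_t cols) {
  std::vector<int64_t> out(static_cast<size_t>(rows * cols));
  for (int64_t i = 0; i < rows; ++i)
    for (int64_t j = 0; j < cols; ++j)
      out[static_cast<size_t>(i * cols + j)] = i;  // row index broadcast across the row
  return out;
}
```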
2 changes: 1 addition & 1 deletion src/operator/mxnet_op.h
@@ -630,7 +630,7 @@ struct Kernel<OP, cpu> {
}
}
#else
-for (size_t i = 0; i < N; ++i) {
+for (index_t i = 0; i < static_cast<index_t>(N); ++i) {
OP::Map(i, args...);
}
#endif
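Note: `OP::Map` is invoked with a signed index, so the serial fallback now counts with `index_t` (64-bit in large-tensor builds) rather than `size_t`. A minimal sketch of the failure mode the wider index avoids — an assumption-level demo, not MXNet code:

```cpp
// Sketch: past 2^31 - 1 elements a 32-bit counter wraps; int64_t does not.
#include <cstdint>
#include <iostream>

int main() {
  const int64_t N = (1LL << 31) + 5;                 // more elements than INT32_MAX
  const int32_t narrowed = static_cast<int32_t>(N);  // wraps on typical implementations
  std::cout << "int32 index sees " << narrowed
            << " elements; int64 sees " << N << "\n";
  return 0;
}
```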
142 changes: 74 additions & 68 deletions src/operator/tensor/ordering_op-inl.h
@@ -83,6 +83,7 @@ struct TopKParam : public dmlc::Parameter<TopKParam> {
DMLC_DECLARE_FIELD(dtype)
.add_enum("uint8", mshadow::kUint8)
.add_enum("int32", mshadow::kInt32)
.add_enum("int64", mshadow::kInt64)
.add_enum("float16", mshadow::kFloat16)
.add_enum("float32", mshadow::kFloat32)
.add_enum("float64", mshadow::kFloat64)
@@ -118,6 +119,7 @@ struct ArgSortParam : public dmlc::Parameter<ArgSortParam> {
DMLC_DECLARE_FIELD(dtype)
.add_enum("uint8", mshadow::kUint8)
.add_enum("int32", mshadow::kInt32)
.add_enum("int64", mshadow::kInt64)
.add_enum("float16", mshadow::kFloat16)
.add_enum("float32", mshadow::kFloat32)
.add_enum("float64", mshadow::kFloat64)
@@ -129,8 +131,8 @@ struct ArgSortParam : public dmlc::Parameter<ArgSortParam> {
};

inline void ParseTopKParam(const mxnet::TShape& src_shape, const TopKParam& param,
-mxnet::TShape *target_shape, int *batch_size, int *element_num,
-int *axis, int *k, bool *do_transpose, bool *is_ascend) {
+mxnet::TShape *target_shape, size_t *batch_size, index_t *element_num,
+int *axis, index_t *k, bool *do_transpose, bool *is_ascend) {
*do_transpose = false;
*k = param.k;
*is_ascend = param.is_ascend;
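Note: this signature change is the heart of the patch — `batch_size` widens to `size_t` while `element_num` (the length of the sorted axis) and `k` widen to `index_t`, so a sort axis longer than `INT32_MAX` parses without truncation. A toy check of that bound (not MXNet code):

```cpp
#include <cstdint>
#include <limits>

int main() {
  const int64_t element_num = 5000000000LL;  // a 5-billion-element sort axis
  // The old `int` out-parameter could not represent this length.
  return element_num > std::numeric_limits<int32_t>::max() ? 0 : 1;
}
```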
@@ -179,54 +181,54 @@ using namespace mshadow;

struct fill_ind_to_one {
template<typename DType>
-MSHADOW_XINLINE static void Map(int i, const int* indices, DType* out) {
+MSHADOW_XINLINE static void Map(index_t i, const index_t* indices, DType* out) {
out[indices[i]] = static_cast<DType>(1);
}
};

struct fill_ind {
template<typename DType>
-MSHADOW_XINLINE static void Map(int i, const int* indices, const DType* val,
+MSHADOW_XINLINE static void Map(index_t i, const index_t* indices, const DType* val,
int req, DType* out) {
KERNEL_ASSIGN(out[indices[i]], req, val[i]);
}
};

template<typename DType>
MSHADOW_FORCE_INLINE void TopKSort(const Tensor<cpu, 1, DType>& dat,
-const Tensor<cpu, 1, int>& ind,
+const Tensor<cpu, 1, index_t>& ind,
const Tensor<cpu, 1, char>& work,
-int K, int N, bool is_ascend,
+index_t K, index_t N, bool is_ascend,
Stream<cpu> *s) {
// Use full sort when K is relatively large.
const bool full_sort(K*8 > N);
// Batch size.
const int M(work.size(0)/(sizeof(DType)*N));
const int omp_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount());
#pragma omp parallel for num_threads(omp_threads)
-for (int i = 0; i < M; ++i) {
+for (index_t i = 0; i < M; ++i) {
// Tensor `work` stores the flattened source data, while `dat` stores the sorted result.
DType *vals = reinterpret_cast<DType*>(work.dptr_);
DType *sorted_vals = dat.dptr_+i*N;
-int *indices = ind.dptr_+i*N;
+index_t *indices = ind.dptr_+i*N;
if (is_ascend) {
if (full_sort) {
std::sort(indices, indices+N,
-[&](const int& i1, const int& i2){ return vals[i1] < vals[i2]; });
+[&](const index_t& i1, const index_t& i2){ return vals[i1] < vals[i2]; });
} else {
std::partial_sort(indices, indices+K, indices+N,
-[&](const int& i1, const int& i2){ return vals[i1] < vals[i2]; });
+[&](const index_t& i1, const index_t& i2){ return vals[i1] < vals[i2]; });
}
} else {
if (full_sort) {
std::sort(indices, indices+N,
-[&](const int& i1, const int& i2){ return vals[i1] > vals[i2]; });
+[&](const index_t& i1, const index_t& i2){ return vals[i1] > vals[i2]; });
} else {
std::partial_sort(indices, indices+K, indices+N,
-[&](const int& i1, const int& i2){ return vals[i1] > vals[i2]; });
+[&](const index_t& i1, const index_t& i2){ return vals[i1] > vals[i2]; });
}
}
-for (int j = 0; j < K; ++j) {
+for (index_t j = 0; j < K; ++j) {
sorted_vals[j] = vals[indices[j]];
}
}
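Note: the CPU path argsorts each batch segment through `std::sort`, or `std::partial_sort` when only the first K positions matter, falling back to a full sort once `K*8 > N`; with the index buffer now `index_t`, the comparators must take `index_t` as well. A self-contained sketch of the same selection pattern (descending order shown), assuming `index_t` is `int64_t`:

```cpp
#include <algorithm>
#include <cstdint>
#include <numeric>
#include <vector>

using index_t = int64_t;  // assumption: large-tensor builds widen index_t to 64 bits

std::vector<index_t> topk_indices(const std::vector<float>& vals, index_t K) {
  std::vector<index_t> idx(vals.size());
  std::iota(idx.begin(), idx.end(), index_t{0});      // 0, 1, 2, ..., N-1
  const index_t N = static_cast<index_t>(vals.size());
  if (K * 8 > N) {  // K is a sizable fraction of N: a full sort is cheaper
    std::sort(idx.begin(), idx.end(),
              [&](index_t a, index_t b) { return vals[a] > vals[b]; });
  } else {          // otherwise only the first K slots need to be ordered
    std::partial_sort(idx.begin(), idx.begin() + K, idx.end(),
                      [&](index_t a, index_t b) { return vals[a] > vals[b]; });
  }
  idx.resize(static_cast<size_t>(K));
  return idx;
}
```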
@@ -364,9 +366,8 @@ MSHADOW_FORCE_INLINE void TopKSort(const Tensor<gpu, 1, DType>& dat,
* \param param the topk parameters
* \tparam xpu the device type.
* \tparam DType type of the output value/mask.
- * \tparam IDType type of the output indices.
*/
-template<typename xpu, typename DType, typename IDType>
+template<typename xpu, typename DType>
void TopKImpl(const RunContext &ctx,
const Resource &resource,
const std::vector<OpReqType>& req,
@@ -380,20 +381,22 @@ void TopKImpl(const RunContext &ctx,
Tensor<xpu, 1, char> workspace;
Tensor<xpu, 1, char> temp_workspace;
Tensor<xpu, 1, DType> sorted_dat;
-Tensor<xpu, 1, int> indices, sel_indices;
-int batch_size, element_num; // number of batches + the size of each batch
+Tensor<xpu, 1, index_t> indices, sel_indices;
+size_t batch_size;
+index_t element_num; // number of batches + the size of each batch
int axis = 0;
bool do_transpose = false;
bool is_ascend = false;
-int k = 0;
+index_t k = 0;
size_t alignment = std::max(sizeof(DType), sizeof(int));
mxnet::TShape target_shape;
ParseTopKParam(src.shape_, param,
&target_shape, &batch_size, &element_num, &axis, &k, &do_transpose, &is_ascend);
-CHECK_LE(element_num, mxnet::common::MaxIntegerValue<IDType>())
-<< "'IDType' does not have a sufficient precision to represent the indices of the input array. "
-<< "The total element_num is " << element_num << ", but the selected IDType can only represent "
-<< mxnet::common::MaxIntegerValue<IDType>() << " elements";
+CHECK_LE(element_num, mxnet::common::MaxIntegerValue<index_t>())
+<< "'index_t' does not have a sufficient precision to represent the indices of "
+<< "the input array. The total element_num is "
+<< element_num << ", but the selected index_t can only represent "
+<< mxnet::common::MaxIntegerValue<index_t>() << " elements";
Tensor<xpu, 3, DType> dat = src.FlatTo3D<xpu, DType>(axis, axis, s);
size_t temp_size = 0;
// Temp space needed by the gpu-based full sorts.
@@ -417,12 +420,12 @@ void TopKImpl(const RunContext &ctx,
sorted_dat = Tensor<xpu, 1, DType>(reinterpret_cast<DType*>(workspace_curr_ptr),
Shape1(src.Size()), s); // contain sorted dat
workspace_curr_ptr += PadBytes(sizeof(DType) * src.Size(), alignment);
-indices = Tensor<xpu, 1, int>(reinterpret_cast<int*>(workspace_curr_ptr),
+indices = Tensor<xpu, 1, index_t>(reinterpret_cast<index_t*>(workspace_curr_ptr),
Shape1(src.Size()), s); // indices in the original matrix
workspace_curr_ptr += PadBytes(sizeof(int) * src.Size(), alignment);

if (param.ret_typ == topk_enum::kReturnMask) {
-sel_indices = Tensor<xpu, 1, int>(reinterpret_cast<int*>(workspace_curr_ptr),
+sel_indices = Tensor<xpu, 1, index_t>(reinterpret_cast<index_t*>(workspace_curr_ptr),
Shape1(batch_size * k), s);
workspace_curr_ptr += PadBytes(sizeof(int) * batch_size * k, alignment);
CHECK_EQ(sel_indices.CheckContiguous(), true);
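Note: the workspace is carved into aligned slices — sorted values, the `index_t` index buffer, and (for the mask return type) the selected indices. Worth noting: the pads after the two index buffers are still computed with `sizeof(int)`, which no longer matches an 8-byte `index_t`. The alignment arithmetic a `PadBytes` helper of this shape performs, as a sketch (assumption — the real helper is defined elsewhere in this file):

```cpp
#include <cstddef>

// Round nbytes up to the next multiple of alignment so the following
// workspace slice starts on an aligned boundary.
inline std::size_t PadBytes(std::size_t nbytes, std::size_t alignment) {
  return ((nbytes + alignment - 1) / alignment) * alignment;
}
// e.g. PadBytes(10, 8) == 16
```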
@@ -454,8 +457,8 @@ void TopKImpl(const RunContext &ctx,
workspace_curr_ptr += temp_size;
}

-mxnet_op::Kernel<range_fwd, xpu>::Launch(s, batch_size * element_num, 1, 0, 1,
-kWriteTo, indices.dptr_);
+mxnet_op::Kernel<range_fwd, xpu>::Launch(s, batch_size * element_num, 1, static_cast<index_t>(0),
+static_cast<index_t>(1), kWriteTo, indices.dptr_);
CHECK_EQ(indices.CheckContiguous(), true);

// 2. Perform inplace batch sort.
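Note: both `range_fwd` launches now pass explicit `index_t` literals so the kernel's start/step arguments instantiate at 64 bits. Its per-element contract as used here (repeat = 1), written as a reference loop rather than the kernel's actual code:

```cpp
#include <cstdint>

using index_t = int64_t;  // assumption: large-tensor build

// out[i] = start + i * step. With start=0, step=1 this enumerates 0..n-1;
// with step=element_num (the backward pass below) it yields each batch's
// base offset into the flattened input.
void range_fwd_ref(index_t n, index_t start, index_t step, index_t* out) {
  for (index_t i = 0; i < n; ++i)
    out[i] = start + i * step;
}
```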
@@ -494,30 +497,30 @@ void TopKImpl(const RunContext &ctx,
}
} else if (param.ret_typ == topk_enum::kReturnIndices) {
if (do_transpose) {
-Tensor<xpu, 3, IDType> ret_indices = ret[0].FlatTo3D<xpu, IDType>(axis, axis, s);
-ASSIGN_DISPATCH(ret_indices, req[0], tcast<IDType>(F<mshadow_op::mod>(transpose(
+Tensor<xpu, 3, index_t> ret_indices = ret[0].FlatTo3D<xpu, index_t>(axis, axis, s);
+ASSIGN_DISPATCH(ret_indices, req[0], tcast<index_t>(F<mshadow_op::mod>(transpose(
slice<2>(inplace_reshape(indices,
Shape3(ret_indices.shape_[0],
ret_indices.shape_[2],
element_num)),
0, k),
Shape3(0, 2, 1)), element_num)));
} else {
-Tensor<xpu, 2, IDType> ret_indices =
-ret[0].get_with_shape<xpu, 2, IDType>(Shape2(batch_size, k), s);
-ASSIGN_DISPATCH(ret_indices, req[0], tcast<IDType>(F<mshadow_op::mod>(slice<1>(
+Tensor<xpu, 2, index_t> ret_indices =
+ret[0].get_with_shape<xpu, 2, index_t>(Shape2(batch_size, k), s);
+ASSIGN_DISPATCH(ret_indices, req[0], tcast<index_t>(F<mshadow_op::mod>(slice<1>(
inplace_reshape(indices, Shape2(batch_size, element_num)), 0, k),
element_num)));
}
} else {
if (do_transpose) {
Tensor<xpu, 3, DType> ret_value = ret[0].FlatTo3D<xpu, DType>(axis, axis, s);
-Tensor<xpu, 3, IDType> ret_indices = ret[1].FlatTo3D<xpu, IDType>(axis, axis, s);
+Tensor<xpu, 3, index_t> ret_indices = ret[1].FlatTo3D<xpu, index_t>(axis, axis, s);
ASSIGN_DISPATCH(ret_value, req[0], transpose(
slice<2>(inplace_reshape(sorted_dat,
Shape3(ret_value.shape_[0], ret_value.shape_[2], element_num)),
0, k), Shape3(0, 2, 1)));
-ASSIGN_DISPATCH(ret_indices, req[1], tcast<IDType>(F<mshadow_op::mod>(transpose(
+ASSIGN_DISPATCH(ret_indices, req[1], tcast<index_t>(F<mshadow_op::mod>(transpose(
slice<2>(inplace_reshape(indices,
Shape3(ret_indices.shape_[0],
ret_indices.shape_[2],
@@ -526,11 +529,11 @@ void TopKImpl(const RunContext &ctx,
} else {
Tensor<xpu, 2, DType> ret_value =
ret[0].get_with_shape<xpu, 2, DType>(Shape2(batch_size, k), s);
-Tensor<xpu, 2, IDType> ret_indices =
-ret[1].get_with_shape<xpu, 2, IDType>(Shape2(batch_size, k), s);
+Tensor<xpu, 2, index_t> ret_indices =
+ret[1].get_with_shape<xpu, 2, index_t>(Shape2(batch_size, k), s);
ASSIGN_DISPATCH(ret_value, req[0],
slice<1>(inplace_reshape(sorted_dat, Shape2(batch_size, element_num)), 0, k));
-ASSIGN_DISPATCH(ret_indices, req[1], tcast<IDType>(F<mshadow_op::mod>(slice<1>(
+ASSIGN_DISPATCH(ret_indices, req[1], tcast<index_t>(F<mshadow_op::mod>(slice<1>(
inplace_reshape(indices, Shape2(batch_size, element_num)), 0, k), element_num)));
}
}
Expand All @@ -545,13 +548,13 @@ void TopK(const nnvm::NodeAttrs& attrs,
const TopKParam& param = nnvm::get<TopKParam>(attrs.parsed);
if (param.ret_typ == topk_enum::kReturnIndices || param.ret_typ == topk_enum::kReturnBoth) {
MXNET_NO_FLOAT16_TYPE_SWITCH(inputs[0].type_flag_, DType, {
-MSHADOW_TYPE_SWITCH(param.dtype, IDType, {
-TopKImpl<xpu, DType, IDType>(ctx.run_ctx, ctx.requested[0], req, inputs[0], outputs, param);
+MSHADOW_TYPE_SWITCH(param.dtype, index_t, {
+TopKImpl<xpu, DType>(ctx.run_ctx, ctx.requested[0], req, inputs[0], outputs, param);
})
});
} else {
MXNET_NO_FLOAT16_TYPE_SWITCH(inputs[0].type_flag_, DType, {
-TopKImpl<xpu, DType, int>(ctx.run_ctx, ctx.requested[0], req, inputs[0], outputs, param);
+TopKImpl<xpu, DType>(ctx.run_ctx, ctx.requested[0], req, inputs[0], outputs, param);
});
}
}
@@ -569,7 +572,7 @@ void Sort(const nnvm::NodeAttrs& attrs,
topk_param.k = 0;
topk_param.ret_typ = topk_enum::kReturnValue;
MXNET_NO_FLOAT16_TYPE_SWITCH(inputs[0].type_flag_, DType, {
-TopKImpl<xpu, DType, int>(ctx.run_ctx, ctx.requested[0], req, inputs[0], outputs, topk_param);
+TopKImpl<xpu, DType>(ctx.run_ctx, ctx.requested[0], req, inputs[0], outputs, topk_param);
});
}

@@ -587,14 +590,14 @@ void ArgSort(const nnvm::NodeAttrs& attrs,
topk_param.dtype = param.dtype;
topk_param.ret_typ = topk_enum::kReturnIndices;
MXNET_NO_FLOAT16_TYPE_SWITCH(inputs[0].type_flag_, DType, {
-MSHADOW_TYPE_SWITCH(param.dtype, IDType, {
-TopKImpl<xpu, DType, IDType>(ctx.run_ctx,
+MSHADOW_TYPE_SWITCH(param.dtype, index_t, {
+TopKImpl<xpu, DType>(ctx.run_ctx,
ctx.requested[0], req, inputs[0], outputs, topk_param);
});
});
}

-template<typename xpu, typename DType, typename IDType>
+template<typename xpu, typename DType>
void TopKBackwardImpl(const OpContext &ctx,
const std::vector<TBlob>& inputs,
const std::vector<OpReqType>& req,
@@ -605,47 +608,49 @@ void TopKBackwardImpl(const OpContext &ctx,
using namespace mshadow::expr;
Stream<xpu> *s = ctx.run_ctx.get_stream<xpu>();
CHECK(param.ret_typ == topk_enum::kReturnValue || param.ret_typ == topk_enum::kReturnBoth);
-int batch_size, element_num; // number of batches + the size of each batch
+size_t batch_size;
+index_t element_num; // number of batches + the size of each batch
int axis = 0;
bool do_transpose = false;
bool is_ascend = false;
-int k = 0;
+index_t k = 0;
mxnet::TShape target_shape;
ParseTopKParam(outputs[0].shape_, param,
&target_shape, &batch_size, &element_num, &axis, &k, &do_transpose, &is_ascend);
-CHECK_LE(element_num, mxnet::common::MaxIntegerValue<IDType>())
-<< "'IDType' does not have a sufficient precision to represent the indices of the input array. "
-<< "The total element_num is " << element_num << ", but the selected IDType can only represent "
-<< mxnet::common::MaxIntegerValue<IDType>() << " elements";
-Tensor<xpu, 1, int> workspace =
-ctx.requested[0].get_space_typed<xpu, 1, int>(Shape1(batch_size * k + batch_size), s);
-Tensor<xpu, 1, int> sel_indices =
-Tensor<xpu, 1, int>(workspace.dptr_, Shape1(batch_size * k), s);
-Tensor<xpu, 1, int> batch_shift =
-Tensor<xpu, 1, int>(workspace.dptr_ + batch_size * k, Shape1(batch_size), s);
+CHECK_LE(element_num, mxnet::common::MaxIntegerValue<index_t>())
+<< "'index_t' does not have a sufficient precision to represent "
+<< "the indices of the input array. The total element_num is "
+<< element_num << ", but the selected index_t can only represent "
+<< mxnet::common::MaxIntegerValue<index_t>() << " elements";
+Tensor<xpu, 1, index_t> workspace =
+ctx.requested[0].get_space_typed<xpu, 1, index_t>(Shape1(batch_size * k + batch_size), s);
+Tensor<xpu, 1, index_t> sel_indices =
+Tensor<xpu, 1, index_t>(workspace.dptr_, Shape1(batch_size * k), s);
+Tensor<xpu, 1, index_t> batch_shift =
+Tensor<xpu, 1, index_t>(workspace.dptr_ + batch_size * k, Shape1(batch_size), s);

Tensor<xpu, 2, DType> out_grad =
inputs[0].get_with_shape<xpu, 2, DType>(Shape2(inputs[0].shape_.Size(), 1), s);
Tensor<xpu, 2, DType> in_grad =
outputs[0].get_with_shape<xpu, 2, DType>(Shape2(outputs[0].shape_.Size(), 1), s);
-mxnet_op::Kernel<range_fwd, xpu>::Launch(s, batch_size, 1, 0, element_num, kWriteTo,
+mxnet_op::Kernel<range_fwd, xpu>::Launch(s, batch_size, 1, static_cast<index_t>(0), element_num, kWriteTo,
batch_shift.dptr_);
if (do_transpose) {
-Tensor<xpu, 1, IDType> indices = inputs[2].FlatTo1D<xpu, IDType>(s);
+Tensor<xpu, 1, index_t> indices = inputs[2].FlatTo1D<xpu, index_t>(s);
mxnet::TShape src_shape = outputs[0].shape_.FlatTo3D(axis);
sel_indices = reshape(transpose(
broadcast_to(inplace_reshape(batch_shift,
Shape3(src_shape[0], src_shape[2], 1)),
mxnet::TShape(Shape3(src_shape[0], src_shape[2], k))),
Shape3(0, 2, 1)),
Shape1(batch_size * k));
-sel_indices += tcast<int>(indices);
+sel_indices += indices;
sel_indices = transpose_indices(sel_indices, Shape3(src_shape[0], src_shape[2], src_shape[1]),
Shape3(0, 2, 1));
} else {
-Tensor<xpu, 2, IDType> indices =
-inputs[2].get_with_shape<xpu, 2, IDType>(Shape2(batch_size, k), s);
-sel_indices = reshape(tcast<int>(indices) +
+Tensor<xpu, 2, index_t> indices =
+inputs[2].get_with_shape<xpu, 2, index_t>(Shape2(batch_size, k), s);
+sel_indices = reshape(indices +
broadcast_to(inplace_reshape(batch_shift, Shape2(batch_size, 1)),
mxnet::TShape(Shape2(batch_size, k))),
Shape1(batch_size * k));
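Note: the backward pass scatters `out_grad` into `in_grad` at the positions the forward pass selected: each batch's indices are shifted by `i * element_num` (the `batch_shift` tensor) so they address the flattened input, and since `indices` is already `index_t` the former `tcast<int>` is gone. A plain-loop sketch of the non-transposed case:

```cpp
#include <cstdint>
#include <vector>

using index_t = int64_t;  // assumption: large-tensor build

// indices holds batch_size * k per-batch positions in [0, element_num);
// the returned vector holds positions into the flattened input gradient.
std::vector<index_t> flatten_topk_indices(const std::vector<index_t>& indices,
                                          index_t batch_size, index_t k,
                                          index_t element_num) {
  std::vector<index_t> sel(static_cast<size_t>(batch_size * k));
  for (index_t i = 0; i < batch_size; ++i)
    for (index_t j = 0; j < k; ++j)
      sel[i * k + j] = indices[i * k + j] + i * element_num;  // batch_shift
  return sel;
}
```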
@@ -674,13 +679,13 @@ void TopKBackward_(const nnvm::NodeAttrs& attrs,
const TopKParam& param = nnvm::get<TopKParam>(attrs.parsed);
if (param.ret_typ == topk_enum::kReturnBoth) {
MXNET_NO_FLOAT16_TYPE_SWITCH(inputs[0].type_flag_, DType, {
-MSHADOW_TYPE_SWITCH(param.dtype, IDType, {
-TopKBackwardImpl<xpu, DType, IDType>(ctx, inputs, req, outputs, param);
+MSHADOW_TYPE_SWITCH(param.dtype, index_t, {
+TopKBackwardImpl<xpu, DType>(ctx, inputs, req, outputs, param);
});
});
} else if (param.ret_typ == topk_enum::kReturnValue) {
MXNET_NO_FLOAT16_TYPE_SWITCH(inputs[0].type_flag_, DType, {
-TopKBackwardImpl<xpu, DType, int>(ctx, inputs, req, outputs, param);
+TopKBackwardImpl<xpu, DType>(ctx, inputs, req, outputs, param);
});
} else {
LOG(FATAL) << "Not Implemented";
@@ -717,7 +722,7 @@ inline bool TopKType(const nnvm::NodeAttrs& attrs,
CHECK(out_size == 1 || out_size == 2);
if (out_size > 1) {
if (param.ret_typ == topk_enum::kReturnValue) {
-CHECK(type_assign(&(*out_attrs)[1], mshadow::kInt32))
+CHECK(type_assign(&(*out_attrs)[1], mshadow::kInt64))
<< "Failed to set the type of ret_indices.";
} else {
CHECK(type_assign(&(*out_attrs)[1], param.dtype))
@@ -752,11 +757,12 @@ inline bool TopKShapeImpl(const TopKParam& param,
CHECK_EQ(out_attrs->size(), 2U);
}
mxnet::TShape& in_shape = (*in_attrs)[0];
-int batch_size, element_num; // number of batches + the size of each batch
+size_t batch_size;
+index_t element_num; // number of batches + the size of each batch
int axis = 0;
bool do_transpose = false;
bool is_ascend = false;
-int k = 0;
+index_t k = 0;
mxnet::TShape target_shape;
ParseTopKParam(in_shape, param,
&target_shape, &batch_size, &element_num, &axis, &k, &do_transpose, &is_ascend);
@@ -785,8 +791,8 @@ inline bool SortType(const nnvm::NodeAttrs& attrs,
size_t out_size = out_attrs->size();
CHECK_EQ(in_size, 1);
CHECK_EQ(out_size, 2);
-CHECK(type_assign(&(*out_attrs)[1], mshadow::kInt32))
-<< "Failed to set the type of ret_indices to int32.";
+CHECK(type_assign(&(*out_attrs)[1], mshadow::kInt64))
+<< "Failed to set the type of ret_indices to int64.";
CHECK(type_assign(&data_type, (*in_attrs)[0])) << "Incompatible dtype of input, in_attrs[0]="
<< (*in_attrs)[0];
CHECK(type_assign(&data_type, (*out_attrs)[0])) << "Incompatible dtype of output, out_attrs[0]="
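Note: with the internal index buffers now `index_t`, the secondary index output of both `topk` (for the value return type) and `sort` is registered as int64 rather than int32. A minimal sketch of the `type_assign` pattern these checks rely on (assumed semantics, not MXNet's actual helper):

```cpp
#include <cassert>

// Assumed contract: fill the slot if its dtype is still undefined (-1),
// otherwise require it to match the expected dtype.
inline bool type_assign(int* slot, int expected) {
  if (*slot == -1) { *slot = expected; return true; }
  return *slot == expected;
}

int main() {
  int ret_indices_dtype = -1;                  // left unspecified by the caller
  assert(type_assign(&ret_indices_dtype, 6));  // 6 == mshadow::kInt64 (assumption)
  assert(ret_indices_dtype == 6);
  return 0;
}
```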
