This repository has been archived by the owner on Nov 17, 2023. It is now read-only.

add a compiler flag to use int64 as tensor size #14570

Merged on Apr 23, 2019 (37 commits). The file changes shown below are from 6 of those commits.

Commits:
41351f3 use a compile flag to use int64 tensor size (apeforest, Mar 29, 2019)
e9bd3cc use personal mshadow repo (apeforest, Mar 29, 2019)
d8d21ed Merge remote-tracking branch 'upstream/master' into perf/large-tensor (apeforest, Apr 2, 2019)
caf8e7f update data type (apeforest, Apr 2, 2019)
0ea2cbc update make config (apeforest, Apr 2, 2019)
3a3c02f change size_t to index_t and add documentation (apeforest, Apr 9, 2019)
b1ca6dd update mshadow submodule to master (apeforest, Apr 15, 2019)
5443fd5 fix compilation warning (apeforest, Apr 15, 2019)
872255f fix compiler warning (apeforest, Apr 15, 2019)
4bd1805 fix compiler warning (apeforest, Apr 15, 2019)
08e9b10 fix compiler warning (apeforest, Apr 15, 2019)
3a4661a Merge remote-tracking branch 'upstream/master' into perf/large-tensor (apeforest, Apr 15, 2019)
d3d6cc6 fix compiler warning (apeforest, Apr 15, 2019)
7e3ed63 fix compiler error (apeforest, Apr 15, 2019)
54735db change nnvm::Tuple to mxnet::Tuple (apeforest, Apr 16, 2019)
5fd9ad1 Merge remote-tracking branch 'upstream/master' into perf/large-tensor (apeforest, Apr 16, 2019)
0758d0c fix compiler warning (apeforest, Apr 16, 2019)
a503ec5 fix compiler warning (apeforest, Apr 16, 2019)
cd9aa53 fix compiler warning (apeforest, Apr 16, 2019)
12559b1 fix compiler warning (apeforest, Apr 16, 2019)
a4e4a0c fix compiler warning (apeforest, Apr 16, 2019)
2399864 fix lint (apeforest, Apr 17, 2019)
334d775 update CI runtime_functons (apeforest, Apr 17, 2019)
826613a update runtime function (apeforest, Apr 17, 2019)
4412b90 correct runtime_functions (apeforest, Apr 17, 2019)
1047eb5 udpate runtime functions (apeforest, Apr 17, 2019)
97a1c08 add nightly test for large tensor (apeforest, Apr 17, 2019)
861b95e update Jenkins files to test new compiler flag (apeforest, Apr 17, 2019)
5054f8d Merge remote-tracking branch 'upstream/master' into perf/large-tensor (apeforest, Apr 17, 2019)
935389d Merge remote-tracking branch 'upstream/master' into perf/large-tensor (apeforest, Apr 18, 2019)
b86e630 fix CI (apeforest, Apr 18, 2019)
f7540d1 Merge remote-tracking branch 'upstream/master' into perf/large-tensor (apeforest, Apr 18, 2019)
d8b04b3 add runtime feature detect for the compiler flag (apeforest, Apr 19, 2019)
20221d6 change build from make to cmake (apeforest, Apr 19, 2019)
bc95113 fix CI (apeforest, Apr 19, 2019)
9c672b7 move tests to nightly (apeforest, Apr 20, 2019)
27584ea Merge remote-tracking branch 'upstream/master' into perf/large-tensor (apeforest, Apr 20, 2019)
Files changed:
2 changes: 1 addition & 1 deletion .gitmodules
@@ -1,6 +1,6 @@
 [submodule "3rdparty/mshadow"]
 	path = 3rdparty/mshadow
-	url = https://github.com/dmlc/mshadow.git
+	url = https://github.com/apeforest/mshadow.git
 [submodule "3rdparty/dmlc-core"]
 	path = 3rdparty/dmlc-core
 	url = https://github.com/dmlc/dmlc-core.git
2 changes: 1 addition & 1 deletion 3rdparty/mshadow (submodule commit pointer updated; contents not shown)
8 changes: 8 additions & 0 deletions CMakeLists.txt
@@ -50,6 +50,7 @@ mxnet_option(USE_SIGNAL_HANDLER "Print stack traces on segfaults." OFF)
 mxnet_option(USE_TENSORRT "Enable inference optimization with TensorRT." OFF)
 mxnet_option(USE_ASAN "Enable Clang/GCC ASAN sanitizers." OFF)
 mxnet_option(ENABLE_TESTCOVERAGE "Enable compilation with test coverage metric output" OFF)
+mxnet_option(USE_INT64_TENSOR_SIZE "Use int64_t to represent the total number of elements in a tensor" OFF)
 
 message(STATUS "CMAKE_CROSSCOMPILING ${CMAKE_CROSSCOMPILING}")
 message(STATUS "CMAKE_HOST_SYSTEM_PROCESSOR ${CMAKE_HOST_SYSTEM_PROCESSOR}")
@@ -295,6 +296,13 @@ else()
   add_definitions(-DMXNET_USE_NCCL=0)
 endif()
 
+if (USE_INT64_TENSOR_SIZE)
+  message(STATUS "Using 64-bit integer for tensor size")
+  add_definitions(-DMSHADOW_INT64_TENSOR_SIZE=1)
+else()
+  add_definitions(-DMSHADOW_INT64_TENSOR_SIZE=0)
+endif()
+
 include(cmake/ChooseBlas.cmake)
 if(USE_CUDA AND FIRST_CUDA)
   include(3rdparty/mshadow/cmake/Utils.cmake)
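For context (not part of the diff): a CMake build opts in with cmake -DUSE_INT64_TENSOR_SIZE=ON, and the option only toggles the MSHADOW_INT64_TENSOR_SIZE preprocessor define. A minimal sketch of the downstream effect, assuming mshadow derives its index type from this macro (the real typedef lives in 3rdparty/mshadow, not in this PR):

    // Sketch of what -DMSHADOW_INT64_TENSOR_SIZE=... selects downstream.
    // Assumption: mshadow picks the width of index_t from this macro.
    #include <cstdint>

    #if MSHADOW_INT64_TENSOR_SIZE == 1
    typedef int64_t index_t;   // tensors may exceed 2^31 - 1 elements
    #else
    typedef int32_t index_t;   // default: 32-bit sizes
    #endif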
5 changes: 5 additions & 0 deletions Makefile
@@ -188,6 +188,11 @@ ifeq ($(USE_OPERATOR_TUNING), 1)
 	CFLAGS += -DMXNET_USE_OPERATOR_TUNING=1
 endif
 
+ifeq ($(USE_INT64_TENSOR_SIZE), 1)
+	CFLAGS += -DMSHADOW_INT64_TENSOR_SIZE=1
+else
+	CFLAGS += -DMSHADOW_INT64_TENSOR_SIZE=0
+endif
 # verify existence of separate lapack library when using blas/openblas/atlas
 # switch off lapack support in case it can't be found
 # issue covered with this
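The make build threads the same define through CFLAGS, so make USE_INT64_TENSOR_SIZE=1 selects the 64-bit mode. A hypothetical sanity check (not part of the PR) that reports which mode a translation unit was compiled in:

    // Hypothetical check of the MSHADOW_INT64_TENSOR_SIZE flag set in CFLAGS.
    #include <cstdio>

    int main() {
    #if MSHADOW_INT64_TENSOR_SIZE == 1
      std::printf("tensor size type: int64_t\n");
    #else
      std::printf("tensor size type: int32_t\n");
    #endif
      return 0;
    }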
13 changes: 7 additions & 6 deletions include/mxnet/tensor_blob.h
@@ -219,15 +219,16 @@ class TBlob {
     return shape_.ndim();
   }
   /*!
-   * \brief return size of i-th dimension, start counting from highest dimension
+   * \brief return size of i-th dimension, start counting from highest dimension.
+   * return type needs to be a signed integer.
    * \param idx the dimension count from the highest dimension
-   * \return the size
+   * \return the size. -1 means unknown size to support zero-size tensor.
    */
   inline index_t size(index_t idx) const {
     return shape_[idx];
   }
   /*! \brief total number of elements in the tensor */
-  inline index_t Size(void) const {
+  inline size_t Size(void) const {
     return shape_.Size();
   }
   /*! \brief get pointer in dtype */
@@ -442,7 +443,7 @@ class FieldEntry<mxnet::TShape>
       throw dmlc::ParamError(os.str());
     }
     if (enforce_nonzero_) {
-      for (mxnet::index_t i = 0; i < v.ndim(); ++i) {
+      for (int i = 0; i < v.ndim(); ++i) {
         if (v[i] == 0U) {
           std::ostringstream os;
           os << "value " << v << "for Parameter " << this->key_
@@ -456,7 +457,7 @@
     this->enforce_nonzero_ = true;
     return this->self();
   }
-  inline FieldEntry<mxnet::TShape> &set_expect_ndim(mxnet::index_t ndim) {
+  inline FieldEntry<mxnet::TShape> &set_expect_ndim(int ndim) {
     expect_ndim_ = ndim;
     return this->self();
   }
@@ -465,7 +466,7 @@
   // whether all the entries need to be nonzero
   bool enforce_nonzero_;
   // expected number of dimension, default = 0 means no restriction.
-  mxnet::index_t expect_ndim_;
+  int expect_ndim_;
 };
 
 }  // namespace parameter
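The asymmetry above is deliberate: per-dimension sizes stay signed (index_t, with -1 reserved for an unknown dimension) while the total element count becomes an unsigned size_t. A standalone sketch of that contract, using a hypothetical TotalSize helper rather than TBlob itself:

    // Sketch: signed per-dimension sizes, unsigned running total,
    // mirroring the size()/Size() split in TBlob above.
    #include <cstddef>
    #include <cstdint>

    typedef int64_t index_t;  // assumption: the large-tensor build

    size_t TotalSize(const index_t* dims, int ndim) {
      size_t total = 1;
      for (int i = 0; i < ndim; ++i) {
        if (dims[i] < 0) return 0;  // -1 marks an unknown dimension
        total *= static_cast<size_t>(dims[i]);
      }
      return total;
    }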
38 changes: 19 additions & 19 deletions include/mxnet/tuple.h
@@ -177,7 +177,7 @@ class Tuple {
     return ndim_ <= kStackCache ? (data_stack_ + ndim_): (data_heap_ + ndim_);
   }
   /*! \return number of dimension of the tuple */
-  inline uint32_t ndim() const {
+  inline int ndim() const {
     return ndim_;
   }
   /*!
@@ -316,17 +316,17 @@ class Tuple {
 
  protected:
   // stack cache size
-  static const uint32_t kStackCache = 4;
+  static const int kStackCache = 4;
   /*! \brief number of dimension of the tuple */
-  uint32_t ndim_{0};
+  int ndim_{0};
   /*! \brief number of cells allocated in data_heap_ */
-  uint32_t num_heap_allocated_{0};
+  int num_heap_allocated_{0};
   /*! \brief in stack space used to store shape when it is small */
   ValueType data_stack_[kStackCache];
   /*! \brief space to store shape when dimension is big*/
   ValueType* data_heap_{nullptr};
   // internal function to change the dimension
-  inline void SetDim(uint32_t ndim) {
+  inline void SetDim(int ndim) {
     if (ndim > kStackCache &&
         ndim > num_heap_allocated_) {
       delete [] data_heap_;
@@ -348,7 +348,7 @@ class TShape : public Tuple<dim_t> {
    * constructor to construct a shape with all 1.
    * \param ndim the number of dimension
    */
-  inline TShape(uint32_t ndim) {  // NOLINT(*)
+  inline TShape(int ndim) {  // NOLINT(*)
     this->SetDim(ndim);
     std::fill_n(begin(), ndim, 1);
   }
@@ -460,7 +460,7 @@ class TShape : public Tuple<dim_t>
    */
   template<int dim>
   inline mshadow::Shape<dim> get() const {
-    CHECK_EQ(dim, static_cast<int>(ndim()))
+    CHECK_EQ(dim, ndim())
         << "dimension do not match target dimension " << dim << " vs " << ndim();
     const dim_t *d = this->data();
     mshadow::Shape<dim> s;
@@ -479,7 +479,7 @@
     const dim_t *d = this->data();
     s.shape_[1] = d[ndim() - 1];
     dim_t ymax = 1;
-    for (size_t i = 1; i < ndim(); ++i) {
+    for (int i = 1; i < ndim(); ++i) {
       ymax *= d[i - 1];
     }
     s.shape_[0] = ymax;
@@ -491,7 +491,7 @@
    * \param axis_end The ending axis specified.
    * \return the flat 3d shape
    */
-  inline mshadow::Shape<3> FlatTo3D(size_t axis_begin, size_t axis_end) const {
+  inline mshadow::Shape<3> FlatTo3D(int axis_begin, int axis_end) const {
     CHECK(axis_end >= axis_begin);
     mshadow::Shape<3> s;
     if (ndim() == 0) return mshadow::Shape3(0, 0, 0);
@@ -500,13 +500,13 @@
     s.shape_[1] = 1;
     s.shape_[2] = 1;
 
-    for (size_t i = 0; i < axis_begin; ++i) {
+    for (int i = 0; i < axis_begin; ++i) {
       s.shape_[0] *= d[i];
     }
-    for (size_t i = axis_begin; i <= axis_end; ++i) {
+    for (int i = axis_begin; i <= axis_end; ++i) {
       s.shape_[1] *= d[i];
     }
-    for (size_t i = axis_end + 1; i < ndim(); ++i) {
+    for (int i = axis_end + 1; i < ndim(); ++i) {
       s.shape_[2] *= d[i];
     }
     return s;
@@ -516,7 +516,7 @@
    * \param axis The axis specified.
    * \return the flat 3d shape
    */
-  inline mshadow::Shape<3> FlatTo3D(size_t axis) const {
+  inline mshadow::Shape<3> FlatTo3D(int axis) const {
     return FlatTo3D(axis, axis);
   }
   inline bool operator==(const TShape &s) const {
@@ -611,9 +611,9 @@ template<typename T>
 struct hash<mxnet::Tuple<T> > {
   /*! \brief hash a Tuple into unsigned int */
   size_t operator()(const mxnet::Tuple<T>& val) const {
-    std::hash<uint32_t> hash_uint;
-    size_t res = hash_uint(val.ndim());
-    for (uint32_t i = 0; i < val.ndim(); ++i) {
+    std::hash<int> hash_int;
+    size_t res = hash_int(val.ndim());
+    for (int i = 0; i < val.ndim(); ++i) {
       res = dmlc::HashCombine(res, val[i]);
     }
     return res;
@@ -625,9 +625,9 @@ template<>
 struct hash<mxnet::TShape> {
   /*! \brief hash a TShape into unsigned int */
   size_t operator()(const mxnet::TShape& val) const {
-    std::hash<uint32_t> hash_uint;
-    size_t res = hash_uint(val.ndim());
-    for (uint32_t i = 0; i < val.ndim(); ++i) {
+    std::hash<int> hash_int;
+    size_t res = hash_int(val.ndim());
+    for (int i = 0; i < val.ndim(); ++i) {
      res = dmlc::HashCombine(res, val[i]);
    }
    return res;
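Switching ndim() and the loop indices from uint32_t/size_t to int is what makes a signed sentinel workable: comparisons against ndim() stay signed, so a value like -1 cannot wrap around to a huge unsigned bound. A small illustration of the failure mode the change avoids:

    // Why signed loop bounds: with unsigned types a -1 sentinel wraps.
    #include <cstddef>
    #include <cstdio>

    int main() {
      int ndim = -1;                               // unknown-shape sentinel
      for (int i = 0; i < ndim; ++i) {}            // body never runs: correct
      size_t wrapped = static_cast<size_t>(ndim);  // 2^64 - 1 on LP64
      std::printf("as an unsigned bound: %zu\n", wrapped);
      return 0;
    }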
6 changes: 6 additions & 0 deletions make/config.mk
@@ -214,6 +214,12 @@ EXTRA_OPERATORS =
 # Create C++ interface package
 USE_CPP_PACKAGE = 0
 
+# Use int64_t type to represent the total number of elements in a tensor
+# This will cause performance degradation reported in issue #14496
+# Set to 1 for large tensors with size greater than INT32_MAX, i.e. 2147483647
+# Note: the size of each dimension is still bounded by INT32_MAX
+USE_INT64_TENSOR_SIZE = 0
+
 #----------------------------
 # plugins
 #----------------------------
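The INT32_MAX bound in the comment is easy to hit in practice; a quick illustration of the arithmetic the flag exists for:

    // The overflow the flag addresses: a 50000 x 50000 tensor holds
    // 2,500,000,000 elements, which does not fit in a 32-bit int.
    #include <cstdint>
    #include <cstdio>

    int main() {
      int64_t elems = 50000LL * 50000LL;
      std::printf("%lld elements vs INT32_MAX = %d\n",
                  static_cast<long long>(elems), INT32_MAX);
      return 0;
    }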
6 changes: 6 additions & 0 deletions make/crosscompile.jetson.mk
@@ -191,6 +191,12 @@ EXTRA_OPERATORS =
 # Create C++ interface package
 USE_CPP_PACKAGE = 0
 
+# Use int64_t type to represent the total number of elements in the tensor
+# This will cause performance degradation reported in issue #14496
+# Set to 1 for large tensors with size greater than INT32_MAX, i.e. 2147483647
+# Note: the size of each dimension is still bounded by INT32_MAX
+USE_INT64_TENSOR_SIZE = 0
+
 #----------------------------
 # plugins
 #----------------------------
6 changes: 6 additions & 0 deletions make/osx.mk
@@ -135,6 +135,12 @@ EXTRA_OPERATORS =
 # Create C++ interface package
 USE_CPP_PACKAGE = 0
 
+# Use int64_t type to represent the total number of elements in a tensor
+# This will cause performance degradation reported in issue #14496
+# Set to 1 for large tensors with size greater than INT32_MAX, i.e. 2147483647
+# Note: the size of each dimension is still bounded by INT32_MAX
+USE_INT64_TENSOR_SIZE = 0
+
 #----------------------------
 # plugins
 #----------------------------
2 changes: 1 addition & 1 deletion src/operator/convolution_v1-inl.h
@@ -336,7 +336,7 @@ class ConvolutionV1Op : public Operator {
     // param_.workspace is in elements of sizeof(DType)
     // if param_.workspace is set to zero the nstep_ equals ishape[0] (batch)
     nstep_ = std::max<index_t>(
-        std::min(static_cast<index_t>(param_.workspace) /
+        std::min<index_t>(param_.workspace /
                  (shape_colunit_.Size() + shape_dstunit_.Size()), ishape[0]),
         1);
 
2 changes: 1 addition & 1 deletion src/operator/nn/deconvolution-inl.h
@@ -460,7 +460,7 @@ class DeconvolutionOp {
                                oshape[2] * oshape[3]);
     // See convolution for workspace calculations. nstep_ will be the effective batch size
     nstep_ = std::max<index_t>(
-        std::min(static_cast<index_t>(param_.workspace) /
+        std::min<index_t>(param_.workspace /
                  (shape_colunit_.Size() + shape_dstunit_.Size()), ishape[0]),
         1);
 
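Both call sites switch from casting one operand to naming the template argument. std::min deduces a single type from its two arguments, so mixed-width operands either fail to deduce or require a cast on one side; std::min<index_t>(...) converts both operands at the call, which is presumably what silenced the warnings this PR fixes. A reduced example of the pattern (the widened index_t is an assumption here):

    // Naming std::min's template argument sidesteps mixed-type deduction.
    #include <algorithm>
    #include <cstdint>

    typedef int64_t index_t;  // assumption: the large-tensor build

    int main() {
      size_t workspace = 1 << 20;  // scratch budget, as an example
      index_t batch = 8;
      // std::min(workspace, batch);            // ill-formed: T is ambiguous
      index_t nstep = std::max<index_t>(
          std::min<index_t>(workspace / 4096, batch), 1);
      return static_cast<int>(nstep);
    }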
4 changes: 2 additions & 2 deletions src/operator/operator_common.h
@@ -165,7 +165,7 @@ inline bool shape_assign(mxnet::TShape *y, const mxnet::TShape& x) {
   } else if (y->ndim() != x.ndim()) {
     return x.ndim() == 0;
   } else {
-    for (size_t i = 0; i < y->ndim(); ++i) {
+    for (int i = 0; i < y->ndim(); ++i) {
       if ((*y)[i] == 0) {
         (*y)[i] = x[i];
       } else if ((*y)[i] != x[i] && x[i] != 0) {
@@ -563,7 +563,7 @@ class OpSignature {
   }
 
   void AddSign(const mxnet::TShape &shape) {
-    for (size_t i = 0; i < shape.ndim(); i++) {
+    for (int i = 0; i < shape.ndim(); i++) {
       hash = hash * 2 + shape[i];
       eles.push_back(shape[i]);
     }
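For reference, the merge rule those shape_assign loops implement: a dimension of 0 in the target is treated as unknown and adopts the incoming value, while known dimensions must agree. A condensed, self-contained restatement:

    // Condensed restatement of shape_assign's per-dimension merge rule.
    #include <cstdio>

    int main() {
      int y[3] = {0, 32, 224};   // target shape, first dim unknown (0)
      int x[3] = {16, 32, 224};  // incoming shape
      bool ok = true;
      for (int i = 0; i < 3; ++i) {
        if (y[i] == 0) {
          y[i] = x[i];                          // fill in the unknown dim
        } else if (y[i] != x[i] && x[i] != 0) {
          ok = false;                           // genuine mismatch
        }
      }
      std::printf("ok=%d shape=(%d, %d, %d)\n", ok, y[0], y[1], y[2]);
      return 0;
    }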