From 41351f3528c15036e7324611d9e90b4d232819fb Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Fri, 29 Mar 2019 10:41:12 -0700 Subject: [PATCH 01/30] use a compile flag to use int64 tensor size --- 3rdparty/mshadow | 2 +- Makefile | 5 +++++ make/config.mk | 4 ++++ make/crosscompile.jetson.mk | 4 ++++ make/osx.mk | 4 ++++ src/operator/convolution_v1-inl.h | 2 +- src/operator/nn/deconvolution-inl.h | 2 +- 7 files changed, 20 insertions(+), 3 deletions(-) diff --git a/3rdparty/mshadow b/3rdparty/mshadow index 95ebe0f109ae..0ea71658816f 160000 --- a/3rdparty/mshadow +++ b/3rdparty/mshadow @@ -1 +1 @@ -Subproject commit 95ebe0f109ae021d0d66e2a627ccfc55c3253b55 +Subproject commit 0ea71658816fa039b312984d186190a6c4e764c8 diff --git a/Makefile b/Makefile index 8ca708018b13..a84e22ff0e9b 100644 --- a/Makefile +++ b/Makefile @@ -188,6 +188,11 @@ ifeq ($(USE_OPERATOR_TUNING), 1) CFLAGS += -DMXNET_USE_OPERATOR_TUNING=1 endif +ifeq ($(USE_INT64_TENSOR_SIZE), 1) + CFLAGS += -DMSHADOW_INT64_TENSOR_SIZE=1 +else + CFLAGS += -DMSHADOW_INT64_TENSOR_SIZE=0 +endif # verify existence of separate lapack library when using blas/openblas/atlas # switch off lapack support in case it can't be found # issue covered with this diff --git a/make/config.mk b/make/config.mk index f9ac4cf10048..d0c4350f8b23 100644 --- a/make/config.mk +++ b/make/config.mk @@ -214,6 +214,10 @@ EXTRA_OPERATORS = # Create C++ interface package USE_CPP_PACKAGE = 0 +# Use int64_t type for tensor size. +# This will cause performance degradation reported in issue #14496 +USE_INT64_TENSOR_SIZE = 0 + #---------------------------- # plugins #---------------------------- diff --git a/make/crosscompile.jetson.mk b/make/crosscompile.jetson.mk index 3db2b98847f3..1cc57c4e9b45 100644 --- a/make/crosscompile.jetson.mk +++ b/make/crosscompile.jetson.mk @@ -191,6 +191,10 @@ EXTRA_OPERATORS = # Create C++ interface package USE_CPP_PACKAGE = 0 +# Use int64_t type for tensor size. +# This will cause performance degradation reported in issue #14496 +USE_INT64_TENSOR_SIZE = 0 + #---------------------------- # plugins #---------------------------- diff --git a/make/osx.mk b/make/osx.mk index 7e32d81a5d71..63c113565724 100644 --- a/make/osx.mk +++ b/make/osx.mk @@ -135,6 +135,10 @@ EXTRA_OPERATORS = # Create C++ interface package USE_CPP_PACKAGE = 0 +# Use int64_t type for tensor size. +# This will cause performance degradation reported in issue #14496 +USE_INT64_TENSOR_SIZE = 0 + #---------------------------- # plugins #---------------------------- diff --git a/src/operator/convolution_v1-inl.h b/src/operator/convolution_v1-inl.h index ed6748a9c85c..d74e2896e10c 100644 --- a/src/operator/convolution_v1-inl.h +++ b/src/operator/convolution_v1-inl.h @@ -336,7 +336,7 @@ class ConvolutionV1Op : public Operator { // param_.workspace is in elements of sizeof(DType) // if param_.workspace is set to zero the nstep_ equals ishape[0] (batch) nstep_ = std::max( - std::min(static_cast(param_.workspace) / + std::min(param_.workspace / (shape_colunit_.Size() + shape_dstunit_.Size()), ishape[0]), 1); diff --git a/src/operator/nn/deconvolution-inl.h b/src/operator/nn/deconvolution-inl.h index 5248c1211ac7..b0efe3ffbbcd 100644 --- a/src/operator/nn/deconvolution-inl.h +++ b/src/operator/nn/deconvolution-inl.h @@ -460,7 +460,7 @@ class DeconvolutionOp { oshape[2] * oshape[3]); // See convolution for workspace calculations. 
nstep_ will be the effective batch size nstep_ = std::max( - std::min(static_cast(param_.workspace) / + std::min(param_.workspace / (shape_colunit_.Size() + shape_dstunit_.Size()), ishape[0]), 1); From e9bd3ccad3b0370329338af806a3b51db25a03a4 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Fri, 29 Mar 2019 12:52:19 -0700 Subject: [PATCH 02/30] use personal mshadow repo --- .gitmodules | 2 +- 3rdparty/mshadow | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitmodules b/.gitmodules index 836d824a6f5a..0945e8f4a6c2 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,6 +1,6 @@ [submodule "3rdparty/mshadow"] path = 3rdparty/mshadow - url = https://github.com/dmlc/mshadow.git + url = https://github.com/apeforest/mshadow.git [submodule "3rdparty/dmlc-core"] path = 3rdparty/dmlc-core url = https://github.com/dmlc/dmlc-core.git diff --git a/3rdparty/mshadow b/3rdparty/mshadow index 0ea71658816f..0c3a6e73dc1f 160000 --- a/3rdparty/mshadow +++ b/3rdparty/mshadow @@ -1 +1 @@ -Subproject commit 0ea71658816fa039b312984d186190a6c4e764c8 +Subproject commit 0c3a6e73dc1fc611cf30e67e9e5e53cf04fa4369 From caf8e7f7d2b4f3f8dcb89aebfb4da596993aba6f Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Tue, 2 Apr 2019 10:33:58 -0700 Subject: [PATCH 03/30] update data type --- 3rdparty/mshadow | 2 +- include/mxnet/tensor_blob.h | 10 ++++----- include/mxnet/tuple.h | 38 +++++++++++++++++----------------- src/operator/operator_common.h | 4 ++-- 4 files changed, 27 insertions(+), 27 deletions(-) diff --git a/3rdparty/mshadow b/3rdparty/mshadow index 0c3a6e73dc1f..655a9923e40b 160000 --- a/3rdparty/mshadow +++ b/3rdparty/mshadow @@ -1 +1 @@ -Subproject commit 0c3a6e73dc1fc611cf30e67e9e5e53cf04fa4369 +Subproject commit 655a9923e40bbd60e0a640b0b115fabf8bf75e49 diff --git a/include/mxnet/tensor_blob.h b/include/mxnet/tensor_blob.h index 7d059025b03e..87bdec95a7a5 100755 --- a/include/mxnet/tensor_blob.h +++ b/include/mxnet/tensor_blob.h @@ -223,11 +223,11 @@ class TBlob { * \param idx the dimension count from the highest dimensin * \return the size */ - inline index_t size(index_t idx) const { + inline size_t size(index_t idx) const { return shape_[idx]; } /*! \brief total number of elements in the tensor */ - inline index_t Size(void) const { + inline size_t Size(void) const { return shape_.Size(); } /*! \brief get pointer in dtype */ @@ -442,7 +442,7 @@ class FieldEntry throw dmlc::ParamError(os.str()); } if (enforce_nonzero_) { - for (mxnet::index_t i = 0; i < v.ndim(); ++i) { + for (int i = 0; i < v.ndim(); ++i) { if (v[i] == 0U) { std::ostringstream os; os << "value " << v << "for Parameter " << this->key_ @@ -456,7 +456,7 @@ class FieldEntry this->enforce_nonzero_ = true; return this->self(); } - inline FieldEntry &set_expect_ndim(mxnet::index_t ndim) { + inline FieldEntry &set_expect_ndim(int ndim) { expect_ndim_ = ndim; return this->self(); } @@ -465,7 +465,7 @@ class FieldEntry // whether all the entries need to be nonzero bool enforce_nonzero_; // expected number of dimension, default = 0 means no restriction. - mxnet::index_t expect_ndim_; + int expect_ndim_; }; } // namespace parameter diff --git a/include/mxnet/tuple.h b/include/mxnet/tuple.h index 7c1367333630..74b810b27263 100644 --- a/include/mxnet/tuple.h +++ b/include/mxnet/tuple.h @@ -177,7 +177,7 @@ class Tuple { return ndim_ <= kStackCache ? (data_stack_ + ndim_): (data_heap_ + ndim_); } /*! \return number of dimension of the tuple */ - inline uint32_t ndim() const { + inline int ndim() const { return ndim_; } /*! 
@@ -316,17 +316,17 @@ class Tuple { protected: // stack cache size - static const uint32_t kStackCache = 4; + static const int kStackCache = 4; /*! \brief number of dimension of the tuple */ - uint32_t ndim_{0}; + int ndim_{0}; /*! \brief number of cells allocated in data_heap_ */ - uint32_t num_heap_allocated_{0}; + int num_heap_allocated_{0}; /*! \brief in stack space used to store shape when it is small */ ValueType data_stack_[kStackCache]; /*! \brief space to store shape when dimension is big*/ ValueType* data_heap_{nullptr}; // internal function to change the dimension - inline void SetDim(uint32_t ndim) { + inline void SetDim(int ndim) { if (ndim > kStackCache && ndim > num_heap_allocated_) { delete [] data_heap_; @@ -348,7 +348,7 @@ class TShape : public Tuple { * constructor to construct a shape with all 1. * \param ndim the number of dimension */ - inline TShape(uint32_t ndim) { // NOLINT(*) + inline TShape(int ndim) { // NOLINT(*) this->SetDim(ndim); std::fill_n(begin(), ndim, 1); } @@ -460,7 +460,7 @@ class TShape : public Tuple { */ template inline mshadow::Shape get() const { - CHECK_EQ(dim, static_cast(ndim())) + CHECK_EQ(dim, ndim()) << "dimension do not match target dimension " << dim << " vs " << ndim(); const dim_t *d = this->data(); mshadow::Shape s; @@ -479,7 +479,7 @@ class TShape : public Tuple { const dim_t *d = this->data(); s.shape_[1] = d[ndim() - 1]; dim_t ymax = 1; - for (size_t i = 1; i < ndim(); ++i) { + for (int i = 1; i < ndim(); ++i) { ymax *= d[i - 1]; } s.shape_[0] = ymax; @@ -491,7 +491,7 @@ class TShape : public Tuple { * \param axis_end The ending axis specified. * \return the flat 3d shape */ - inline mshadow::Shape<3> FlatTo3D(size_t axis_begin, size_t axis_end) const { + inline mshadow::Shape<3> FlatTo3D(int axis_begin, int axis_end) const { CHECK(axis_end >= axis_begin); mshadow::Shape<3> s; if (ndim() == 0) return mshadow::Shape3(0, 0, 0); @@ -500,13 +500,13 @@ class TShape : public Tuple { s.shape_[1] = 1; s.shape_[2] = 1; - for (size_t i = 0; i < axis_begin; ++i) { + for (int i = 0; i < axis_begin; ++i) { s.shape_[0] *= d[i]; } - for (size_t i = axis_begin; i <= axis_end; ++i) { + for (int i = axis_begin; i <= axis_end; ++i) { s.shape_[1] *= d[i]; } - for (size_t i = axis_end + 1; i < ndim(); ++i) { + for (int i = axis_end + 1; i < ndim(); ++i) { s.shape_[2] *= d[i]; } return s; @@ -516,7 +516,7 @@ class TShape : public Tuple { * \param axis The axis specified. * \return the flat 3d shape */ - inline mshadow::Shape<3> FlatTo3D(size_t axis) const { + inline mshadow::Shape<3> FlatTo3D(int axis) const { return FlatTo3D(axis, axis); } inline bool operator==(const TShape &s) const { @@ -611,9 +611,9 @@ template struct hash > { /*! \brief hash a Tuple into unsigned int */ size_t operator()(const mxnet::Tuple& val) const { - std::hash hash_uint; - size_t res = hash_uint(val.ndim()); - for (uint32_t i = 0; i < val.ndim(); ++i) { + std::hash hash_int; + size_t res = hash_int(val.ndim()); + for (int i = 0; i < val.ndim(); ++i) { res = dmlc::HashCombine(res, val[i]); } return res; @@ -625,9 +625,9 @@ template<> struct hash { /*! 
\brief hash a TShape into unsigned int */ size_t operator()(const mxnet::TShape& val) const { - std::hash hash_uint; - size_t res = hash_uint(val.ndim()); - for (uint32_t i = 0; i < val.ndim(); ++i) { + std::hash hash_int; + size_t res = hash_int(val.ndim()); + for (int i = 0; i < val.ndim(); ++i) { res = dmlc::HashCombine(res, val[i]); } return res; diff --git a/src/operator/operator_common.h b/src/operator/operator_common.h index f629534dabd0..0709c7457a42 100644 --- a/src/operator/operator_common.h +++ b/src/operator/operator_common.h @@ -165,7 +165,7 @@ inline bool shape_assign(mxnet::TShape *y, const mxnet::TShape& x) { } else if (y->ndim() != x.ndim()) { return x.ndim() == 0; } else { - for (size_t i = 0; i < y->ndim(); ++i) { + for (int i = 0; i < y->ndim(); ++i) { if ((*y)[i] == 0) { (*y)[i] = x[i]; } else if ((*y)[i] != x[i] && x[i] != 0) { @@ -563,7 +563,7 @@ class OpSignature { } void AddSign(const mxnet::TShape &shape) { - for (size_t i = 0; i < shape.ndim(); i++) { + for (int i = 0; i < shape.ndim(); i++) { hash = hash * 2 + shape[i]; eles.push_back(shape[i]); } From 0ea2cbca2280aeec27f66b1cec93837c7ac6c770 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Tue, 2 Apr 2019 10:48:51 -0700 Subject: [PATCH 04/30] update make config --- make/config.mk | 1 + make/osx.mk | 1 + 2 files changed, 2 insertions(+) diff --git a/make/config.mk b/make/config.mk index d0c4350f8b23..4e59beb9500a 100644 --- a/make/config.mk +++ b/make/config.mk @@ -216,6 +216,7 @@ USE_CPP_PACKAGE = 0 # Use int64_t type for tensor size. # This will cause performance degradation reported in issue #14496 +# Set to 1 for large tensor size greater than 2 billion USE_INT64_TENSOR_SIZE = 0 #---------------------------- diff --git a/make/osx.mk b/make/osx.mk index 63c113565724..5382004b976b 100644 --- a/make/osx.mk +++ b/make/osx.mk @@ -137,6 +137,7 @@ USE_CPP_PACKAGE = 0 # Use int64_t type for tensor size. # This will cause performance degradation reported in issue #14496 +# Set to 1 for large tensor size greater than 2 billion USE_INT64_TENSOR_SIZE = 0 #---------------------------- From 3a3c02f4928d502e070c25c0fb0e98c4449de51d Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Tue, 9 Apr 2019 13:47:30 -0700 Subject: [PATCH 05/30] change size_t to index_t and add documentation --- 3rdparty/mshadow | 2 +- CMakeLists.txt | 8 ++++++++ include/mxnet/tensor_blob.h | 7 ++++--- make/config.mk | 5 +++-- make/crosscompile.jetson.mk | 4 +++- make/osx.mk | 5 +++-- 6 files changed, 22 insertions(+), 9 deletions(-) diff --git a/3rdparty/mshadow b/3rdparty/mshadow index 655a9923e40b..bb6a03716ed8 160000 --- a/3rdparty/mshadow +++ b/3rdparty/mshadow @@ -1 +1 @@ -Subproject commit 655a9923e40bbd60e0a640b0b115fabf8bf75e49 +Subproject commit bb6a03716ed8c898e6ce149bfb769d8783c2b29b diff --git a/CMakeLists.txt b/CMakeLists.txt index 4e582bae5e95..11436cd62935 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -50,6 +50,7 @@ mxnet_option(USE_SIGNAL_HANDLER "Print stack traces on segfaults." OFF) mxnet_option(USE_TENSORRT "Enable infeference optimization with TensorRT." OFF) mxnet_option(USE_ASAN "Enable Clang/GCC ASAN sanitizers." 
OFF) mxnet_option(ENABLE_TESTCOVERAGE "Enable compilation with test coverage metric output" OFF) +mxnet_option(USE_INT64_TENSOR_SIZE "Use int64_t to represent the total number of elements in a tensor" OFF) message(STATUS "CMAKE_CROSSCOMPILING ${CMAKE_CROSSCOMPILING}") message(STATUS "CMAKE_HOST_SYSTEM_PROCESSOR ${CMAKE_HOST_SYSTEM_PROCESSOR}") @@ -295,6 +296,13 @@ else() add_definitions(-DMXNET_USE_NCCL=0) endif() +if (USE_INT64_TENSOR_SIZE) + message(STATUS "Using 64-bit integer for tensor size") + add_definitions(-DMSHADOW_INT64_TENSOR_SIZE=1) +else() + add_definitions(-DMSHADOW_INT64_TENSOR_SIZE=0) +endif() + include(cmake/ChooseBlas.cmake) if(USE_CUDA AND FIRST_CUDA) include(3rdparty/mshadow/cmake/Utils.cmake) diff --git a/include/mxnet/tensor_blob.h b/include/mxnet/tensor_blob.h index 87bdec95a7a5..894be85bbe55 100755 --- a/include/mxnet/tensor_blob.h +++ b/include/mxnet/tensor_blob.h @@ -219,11 +219,12 @@ class TBlob { return shape_.ndim(); } /*! - * \brief return size of i-th dimension, start counting from highest dimension + * \brief return size of i-th dimension, start counting from highest dimension. + * return type needs to be a signed integer. * \param idx the dimension count from the highest dimensin - * \return the size + * \return the size. -1 means unknown size to support zero-size tensor. */ - inline size_t size(index_t idx) const { + inline index_t size(index_t idx) const { return shape_[idx]; } /*! \brief total number of elements in the tensor */ diff --git a/make/config.mk b/make/config.mk index 4e59beb9500a..44983cf2b2c0 100644 --- a/make/config.mk +++ b/make/config.mk @@ -214,9 +214,10 @@ EXTRA_OPERATORS = # Create C++ interface package USE_CPP_PACKAGE = 0 -# Use int64_t type for tensor size. +# Use int64_t type to represent the total number of elements in a tensor # This will cause performance degradation reported in issue #14496 -# Set to 1 for large tensor size greater than 2 billion +# Set to 1 for large tensor with tensor size greater than INT32_MAX i.e. 2147483647 +# Note: the size of each dimension is still bounded by INT32_MAX USE_INT64_TENSOR_SIZE = 0 #---------------------------- diff --git a/make/crosscompile.jetson.mk b/make/crosscompile.jetson.mk index 1cc57c4e9b45..fe022ed2672f 100644 --- a/make/crosscompile.jetson.mk +++ b/make/crosscompile.jetson.mk @@ -191,8 +191,10 @@ EXTRA_OPERATORS = # Create C++ interface package USE_CPP_PACKAGE = 0 -# Use int64_t type for tensor size. +# Use int64_t type to represent the total number of elements in the tensor # This will cause performance degradation reported in issue #14496 +# Set to 1 for large tensor with tensor size greater than INT32_MAX i.e. 2147483647 +# Note: the size of each dimension is still bounded by INT32_MAX USE_INT64_TENSOR_SIZE = 0 #---------------------------- diff --git a/make/osx.mk b/make/osx.mk index 5382004b976b..0b5842e59524 100644 --- a/make/osx.mk +++ b/make/osx.mk @@ -135,9 +135,10 @@ EXTRA_OPERATORS = # Create C++ interface package USE_CPP_PACKAGE = 0 -# Use int64_t type for tensor size. +# Use int64_t type to represent the total number of elements in a tensor # This will cause performance degradation reported in issue #14496 -# Set to 1 for large tensor size greater than 2 billion +# Set to 1 for large tensor with tensor size greater than INT32_MAX i.e. 
2147483647 +# Note: the size of each dimension is still bounded by INT32_MAX USE_INT64_TENSOR_SIZE = 0 #---------------------------- From b1ca6dd2f9ccf78150e87bc541e5de97736ab77f Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Mon, 15 Apr 2019 10:40:12 -0700 Subject: [PATCH 06/30] update mshadow submodule to master --- .gitmodules | 2 +- 3rdparty/mshadow | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitmodules b/.gitmodules index 9299905764af..e0ffec11bfd0 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,6 +1,6 @@ [submodule "3rdparty/mshadow"] path = 3rdparty/mshadow - url = https://github.com/apeforest/mshadow.git + url = https://github.com/dmlc/mshadow.git [submodule "3rdparty/dmlc-core"] path = 3rdparty/dmlc-core url = https://github.com/dmlc/dmlc-core.git diff --git a/3rdparty/mshadow b/3rdparty/mshadow index bb6a03716ed8..6e94643bdf1d 160000 --- a/3rdparty/mshadow +++ b/3rdparty/mshadow @@ -1 +1 @@ -Subproject commit bb6a03716ed8c898e6ce149bfb769d8783c2b29b +Subproject commit 6e94643bdf1d51a505b147f28c358fb71070b8fd From 5443fd5df269b360e5db8bc03eb91e386badfcca Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Mon, 15 Apr 2019 11:30:03 -0700 Subject: [PATCH 07/30] fix compilation warning --- src/operator/contrib/index_copy-inl.h | 2 +- src/operator/custom/custom.cc | 4 +- src/operator/nn/pooling.cc | 2 +- src/operator/random/multisample_op.h | 2 +- src/operator/random/sample_multinomial_op.h | 4 +- src/operator/tensor/indexing_op.h | 6 +- src/operator/tensor/matrix_op-inl.h | 72 ++++++++++----------- 7 files changed, 47 insertions(+), 45 deletions(-) diff --git a/src/operator/contrib/index_copy-inl.h b/src/operator/contrib/index_copy-inl.h index 903dee13272b..69ac94e853fa 100644 --- a/src/operator/contrib/index_copy-inl.h +++ b/src/operator/contrib/index_copy-inl.h @@ -64,7 +64,7 @@ inline bool IndexCopyShape(const nnvm::NodeAttrs& attrs, CHECK_EQ(in_attrs->at(1).ndim(), 1); // Shape matching CHECK_EQ(in_attrs->at(0).ndim(), in_attrs->at(2).ndim()); - for (size_t i = 0; i < in_attrs->at(0).ndim(); ++i) { + for (int i = 0; i < in_attrs->at(0).ndim(); ++i) { if (i == 0) { CHECK_GE(in_attrs->at(0)[i], in_attrs->at(2)[i]); } else { diff --git a/src/operator/custom/custom.cc b/src/operator/custom/custom.cc index 46249c9bbcc6..5d0455298ea6 100644 --- a/src/operator/custom/custom.cc +++ b/src/operator/custom/custom.cc @@ -137,7 +137,7 @@ bool InferShape(const NodeAttrs& attrs, for (size_t i = 0; i < in_shape->size(); ++i) { shapes[i] = ptr; ndims[i] = (*in_shape)[i].ndim(); - for (size_t j = 0; j < (*in_shape)[i].ndim(); ++j, ++ptr) { + for (int j = 0; j < (*in_shape)[i].ndim(); ++j, ++ptr) { *ptr = static_cast((*in_shape)[i][j]); } } @@ -263,7 +263,7 @@ OpStatePtr CreateState(const NodeAttrs& attrs, Context ctx, for (size_t i = 0; i < in_shape.size(); ++i) { shapes[i] = ptr; ndims[i] = in_shape[i].ndim(); - for (size_t j = 0; j < in_shape[i].ndim(); ++j, ++ptr) { + for (int j = 0; j < in_shape[i].ndim(); ++j, ++ptr) { *ptr = static_cast(in_shape[i][j]); } } diff --git a/src/operator/nn/pooling.cc b/src/operator/nn/pooling.cc index 2d16604baa20..7cdef54abd83 100644 --- a/src/operator/nn/pooling.cc +++ b/src/operator/nn/pooling.cc @@ -133,7 +133,7 @@ static bool PoolingShape(const nnvm::NodeAttrs &attrs, default: LOG(FATAL) << "Unsupported tensor layout " << param.layout.value(); } - for (size_t i{1}; i < dshape.ndim(); i++) + for (int i = 1; i < dshape.ndim(); i++) if (i != c_index) oshape[i] = 1; out_shape->clear(); diff --git a/src/operator/random/multisample_op.h 
b/src/operator/random/multisample_op.h index e9f266932e13..7d5e256297ad 100644 --- a/src/operator/random/multisample_op.h +++ b/src/operator/random/multisample_op.h @@ -66,7 +66,7 @@ inline bool MultiSampleOpShape(const nnvm::NodeAttrs& attrs, // Get shape to be sampled for each parameter set. const MultiSampleParam& param = nnvm::get(attrs.parsed); mxnet::TShape sshape = param.shape; - for (size_t i = 0; i < sshape.ndim(); ++i) { + for (int i = 0; i < sshape.ndim(); ++i) { CHECK_GT(sshape[i], 0) << "shape parameter must be non-zero within each dimension"; } // Examine output shape whether it is already defined. diff --git a/src/operator/random/sample_multinomial_op.h b/src/operator/random/sample_multinomial_op.h index e76cd646b850..eb4ddd8e90bd 100644 --- a/src/operator/random/sample_multinomial_op.h +++ b/src/operator/random/sample_multinomial_op.h @@ -87,10 +87,10 @@ inline bool SampleMultinomialOpShape(const nnvm::NodeAttrs& attrs, } mxnet::TShape oshape(ishape.ndim() - 1 + param.shape.ndim()); - for (size_t i = 0; i < ishape.ndim() - 1; ++i) { + for (int i = 0; i < ishape.ndim() - 1; ++i) { oshape[i] = ishape[i]; } - for (size_t i = 0; i < param.shape.ndim(); ++i) { + for (int i = 0; i < param.shape.ndim(); ++i) { oshape[i + ishape.ndim() - 1] = param.shape[i]; } SHAPE_ASSIGN_CHECK(*out_attrs, 0, oshape); diff --git a/src/operator/tensor/indexing_op.h b/src/operator/tensor/indexing_op.h index 8979531fef4e..d6e89e176afe 100644 --- a/src/operator/tensor/indexing_op.h +++ b/src/operator/tensor/indexing_op.h @@ -1272,7 +1272,9 @@ inline bool GatherNDShape(const nnvm::NodeAttrs& attrs, mxnet::TShape oshape(ishape.ndim() - 1 + dshape.ndim() - ishape[0]); - for (size_t i = 0; i < ishape.ndim() - 1; ++i) oshape[i] = ishape[i+1]; + for (int i = 0; i < ishape.ndim() - 1; ++i) { + oshape[i] = ishape[i+1]; + } for (int i = 0; i < dshape.ndim() - ishape[0]; ++i) { oshape[ishape.ndim()-1+i] = dshape[ishape[0] + i]; } @@ -1370,7 +1372,7 @@ inline bool ScatterNDShape(const nnvm::NodeAttrs& attrs, bool valid = dshape.ndim() == ishape.ndim() - 1 + oshape.ndim() - ishape[0]; - for (size_t i = 0; i < ishape.ndim() - 1; ++i) { + for (int i = 0; i < ishape.ndim() - 1; ++i) { valid = valid && dshape[i] == ishape[i+1]; } for (int i = 0; i < oshape.ndim() - ishape[0]; ++i) { diff --git a/src/operator/tensor/matrix_op-inl.h b/src/operator/tensor/matrix_op-inl.h index 5eecda622729..38b153e43798 100644 --- a/src/operator/tensor/matrix_op-inl.h +++ b/src/operator/tensor/matrix_op-inl.h @@ -224,8 +224,8 @@ inline bool FlattenShape(const nnvm::NodeAttrs& attrs, CHECK_EQ(out_attrs->size(), 1U); const mxnet::TShape &dshape = (*in_attrs)[0]; if (dshape.ndim() == 0) return false; - uint32_t target_dim = 1; - for (uint32_t i = 1; i < dshape.ndim(); ++i) { + int target_dim = 1; + for (int i = 1; i < dshape.ndim(); ++i) { target_dim *= dshape[i]; } SHAPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::Shape2(dshape[0], target_dim)); @@ -335,7 +335,7 @@ inline bool TransposeShape(const nnvm::NodeAttrs& attrs, } } else { CHECK_EQ(shp.ndim(), param.axes.ndim()); - for (size_t i = 0; i < shp.ndim(); ++i) { + for (int i = 0; i < shp.ndim(); ++i) { CHECK(param.axes[i] < static_cast(shp.ndim())); ret[i] = shp[param.axes[i]]; } @@ -726,7 +726,7 @@ inline bool SliceOpShape(const nnvm::NodeAttrs& attrs, MXNET_NDIM_SWITCH(dshape.ndim(), ndim, { common::StaticArray begin, end, step; GetIndexRange(dshape, param.begin, param.end, param.step, &begin, &end, &step); - for (index_t i = 0; i < param.begin.ndim(); ++i) { + for (int i = 0; i < 
param.begin.ndim(); ++i) { const int b = begin[i], e = end[i], s = step[i]; SetSliceOpOutputDimSize(i, b, e, s, &oshape); } @@ -1110,9 +1110,9 @@ inline void GetSliceAxisParams(const SliceAxisParam& param, const mxnet::TShape& int* axis, index_t* begin, index_t* end) { *axis = param.axis; if (*axis < 0) { - *axis += static_cast(ishape.ndim()); + *axis += ishape.ndim(); } - CHECK(*axis < static_cast(ishape.ndim()) && *axis >= 0) << + CHECK(*axis < ishape.ndim() && *axis >= 0) << "Transformed axis must be smaller than the source ndim and larger than zero! Recieved axis=" << param.axis << ", src_ndim=" << ishape.ndim() << ", transformed axis=" << *axis; index_t axis_size = static_cast(ishape[*axis]); @@ -1153,8 +1153,8 @@ inline bool SliceAxisShape(const nnvm::NodeAttrs& attrs, index_t begin, end; GetSliceAxisParams(param, ishape, &axis, &begin, &end); mxnet::TShape shape(ishape.ndim()); - for (index_t i = 0; i < ishape.ndim(); ++i) { - if (static_cast(i) == axis) { + for (int i = 0; i < ishape.ndim(); ++i) { + if (i == axis) { shape[i] = static_cast(end - begin); } else { shape[i] = ishape[i]; @@ -1177,7 +1177,7 @@ void SliceAxis(const nnvm::NodeAttrs& attrs, int axis; index_t begin, end; GetSliceAxisParams(param, inputs[0].shape_, &axis, &begin, &end); - int ndim = static_cast(outputs[0].ndim()); + int ndim = outputs[0].ndim(); if (axis + 1 == ndim) { MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { @@ -1212,7 +1212,7 @@ void SliceAxisGrad_(const nnvm::NodeAttrs& attrs, int axis; index_t begin, end; GetSliceAxisParams(param, outputs[0].shape_, &axis, &begin, &end); - int ndim = static_cast(outputs[0].shape_.ndim()); + int ndim = outputs[0].shape_.ndim(); if (axis + 1 == ndim) { MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { @@ -1278,12 +1278,12 @@ inline bool SliceLikeShape(const nnvm::NodeAttrs& attrs, } else { mxnet::TShape shape(ishape); for (index_t i = 0; i < param.axes.ndim(); ++i) { - int axis = static_cast(param.axes[i]); + int axis = param.axes[i]; if (axis < 0) { - axis += static_cast(ishape.ndim()); + axis += ishape.ndim(); } CHECK_GE(axis, 0) - << "Slice axis: " << static_cast(param.axes[i]) << " too small"; + << "Slice axis: " << param.axes[i] << " too small"; CHECK_GT(ishape.ndim(), axis) << "Slice axis: " << axis << " exceeds first input: " << ishape.ndim(); CHECK_GT(from_shape.ndim(), axis) @@ -1314,16 +1314,16 @@ inline void SliceLikeInferRanges(const mxnet::TShape& dshape, ps[i] = 1; } } else { - for (index_t i = 0; i < axes.ndim(); ++i) { - int axis = static_cast(axes[i]); + for (int i = 0; i < axes.ndim(); ++i) { + int axis = axes[i]; if (axis < 0) { - axis += static_cast(dshape.ndim()); + axis += dshape.ndim(); } CHECK_GE(axis, 0) - << "Slice axis: " << static_cast(axes[i]) << " too small"; - CHECK_LT(axis, static_cast(dshape.ndim())) + << "Slice axis: " << axes[i] << " too small"; + CHECK_LT(axis, dshape.ndim()) << "Slice axis: " << axis << " exceeds first input: " << dshape.ndim(); - CHECK_LT(axis, static_cast(fshape.ndim())) + CHECK_LT(axis, fshape.ndim()) << "Slice axis: " << axis << " exceeds first input: " << fshape.ndim(); pb[axis] = 0; pe[axis] = fshape[axis]; @@ -1542,7 +1542,7 @@ inline void GetRepeatParams(const RepeatParam& param, const mxnet::TShape& ishap CHECK_GE(*repeats, 0) << "repeats cannot be a negative number"; *axisOpt = param.axis; if (static_cast(*axisOpt)) { - int ndims = static_cast(ishape.ndim()); + int ndims = ishape.ndim(); int axis = axisOpt->value(); if (axis < 0) { axis += ndims; @@ -1569,14 +1569,14 @@ inline bool 
RepeatOpShape(const nnvm::NodeAttrs& attrs, // If repeats > 0, multiply the size of the corresponding axis by repeats if (static_cast(axisOpt)) { - int ndims = static_cast(ishape.ndim()); + int ndims = ishape.ndim(); int axis = axisOpt.value(); if (axis < 0) { axis += ndims; } mxnet::TShape shape(ishape.ndim()); - for (index_t i = 0; i < ishape.ndim(); ++i) { - if (static_cast(i) == axis) { + for (int i = 0; i < ishape.ndim(); ++i) { + if (i == axis) { shape[i] = static_cast(repeats) * ishape[i]; } else { shape[i] = ishape[i]; @@ -1616,11 +1616,11 @@ inline std::pair ReshapeInputOutputForRepeatOp( const int repeats) { if (static_cast(axisOpt)) { int axis = axisOpt.value(); - int ndim = static_cast(ishape.ndim()); + int ndim = ishape.ndim(); if (axis < 0) { axis += ndim; } - CHECK(axis >= 0 && axis < static_cast(ishape.ndim())) << "Invalid input of axis"; + CHECK(axis >= 0 && axis < ishape.ndim()) << "Invalid input of axis"; // reshape the input tensor by adding a dim at the (axis+1)-th dim mxnet::TShape rshape(ishape.ndim()+1); @@ -1633,7 +1633,7 @@ inline std::pair ReshapeInputOutputForRepeatOp( } rshape[i] = 1; bshape[i] = repeats; - while (i < static_cast(ishape.ndim())) { + while (i < ishape.ndim()) { rshape[i+1] = ishape[i]; bshape[i+1] = ishape[i]; ++i; @@ -1757,13 +1757,13 @@ inline bool TileOpShape(const nnvm::NodeAttrs& attrs, SHAPE_ASSIGN_CHECK(*out_attrs, 0, ishape); return true; } - for (size_t i = 0; i < reps.ndim(); ++i) { + for (int i = 0; i < reps.ndim(); ++i) { CHECK_GT(reps[i], 0) << "invalid reps=" << i << ", dim size must be greater than zero"; } mxnet::TShape oshape(std::max(ishape.ndim(), reps.ndim())); - int i1 = static_cast(ishape.ndim()) - 1; - int i2 = static_cast(reps.ndim()) - 1; - for (int i = static_cast(oshape.ndim()) - 1; i >= 0; --i) { + int i1 = ishape.ndim() - 1; + int i2 = reps.ndim() - 1; + for (int i = oshape.ndim() - 1; i >= 0; --i) { if (i1 >= 0 && i2 >= 0) { oshape[i] = ishape[i1--] * reps[i2--]; } else if (i1 >= 0) { @@ -1808,9 +1808,9 @@ inline std::pair ReshapeInputOutputForTileOp( // The shape of the input tensor after adding new axes before each dim mxnet::TShape rshape(bshape.ndim()); - int i1 = static_cast(ishape.ndim()) - 1; - int i2 = static_cast(reps.ndim()) - 1; - for (int i = static_cast(bshape.ndim()) - 1; i >= 0; --i) { + int i1 = ishape.ndim() - 1; + int i2 = reps.ndim() - 1; + for (int i = bshape.ndim() - 1; i >= 0; --i) { if (0 == (i & 1)) { bshape[i] = (i2 >= 0? reps[i2--] : 1); rshape[i] = 1; @@ -1986,7 +1986,7 @@ void ReverseOpForward(const nnvm::NodeAttrs& attrs, std::vector trailing_(param.axis.ndim()); index_t reverse_index = 0; for (int axis : param.axis) { - CHECK_LT(axis, static_cast(ishape.ndim())); + CHECK_LT(axis, ishape.ndim()); stride_[reverse_index] = ishape[axis]; trailing_[reverse_index] = 1; for (index_t i2 = axis + 1; i2 < ishape.ndim(); ++i2) { @@ -2599,7 +2599,7 @@ inline bool SplitOpShape(const nnvm::NodeAttrs& attrs, int num_outputs = (param.sections > 0) ? 
indices.ndim() - 1 : indices.ndim(); // Pre-compute squeezed output shape for future usage mxnet::TShape squeezed_dshape = dshape; - for (int d = real_axis; d < static_cast(squeezed_dshape.ndim()) - 1; ++d) { + for (int d = real_axis; d < squeezed_dshape.ndim() - 1; ++d) { squeezed_dshape[d] = squeezed_dshape[d+1]; } squeezed_dshape = mxnet::TShape(&squeezed_dshape[0], &squeezed_dshape[squeezed_dshape.ndim()-1]); @@ -2631,7 +2631,7 @@ inline bool SplitOpShape(const nnvm::NodeAttrs& attrs, back_calculate_dshape[real_axis] += (*out_attrs)[i][real_axis]; } } - for (int d = real_axis + 1; d < static_cast(ishape.ndim()); ++d) { + for (int d = real_axis + 1; d < ishape.ndim(); ++d) { if (param.squeeze_axis) { back_calculate_dshape[d] = (*out_attrs)[0][d - 1]; } else { From 872255f200099a3a488a02e702e139ea92327260 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Mon, 15 Apr 2019 12:10:29 -0700 Subject: [PATCH 08/30] fix compiler warning --- src/io/iter_batchloader.h | 2 +- src/operator/control_flow.cc | 6 +++--- src/operator/nn/pooling.cc | 2 +- .../quantization/quantized_flatten-inl.h | 4 ++-- src/operator/tensor/matrix_op-inl.h | 18 +++++++++--------- 5 files changed, 16 insertions(+), 16 deletions(-) diff --git a/src/io/iter_batchloader.h b/src/io/iter_batchloader.h index 69eb05f7d729..279690b594e6 100644 --- a/src/io/iter_batchloader.h +++ b/src/io/iter_batchloader.h @@ -78,7 +78,7 @@ class BatchLoader : public IIterator { // if overflow from previous round, directly return false, until before first is called if (num_overflow_ != 0) return false; - index_t top = 0; + size_t top = 0; while (base_->Next()) { const DataInst& d = base_->Value(); diff --git a/src/operator/control_flow.cc b/src/operator/control_flow.cc index ac6fea7c143b..87da1f9871f9 100644 --- a/src/operator/control_flow.cc +++ b/src/operator/control_flow.cc @@ -322,7 +322,7 @@ static bool ForeachShape(const nnvm::NodeAttrs& attrs, auto out = mxnet::TShape(g_out_shape.ndim() + 1); out[0] = len; - for (size_t i = 1; i < out.ndim(); i++) + for (int i = 1; i < out.ndim(); i++) out[i] = g_out_shape[i - 1]; SHAPE_ASSIGN_CHECK(*out_shape, i, out); } @@ -346,7 +346,7 @@ static bool ForeachShape(const nnvm::NodeAttrs& attrs, } else { auto in = mxnet::TShape(shape.ndim() + 1); in[0] = len; - for (size_t i = 1; i < in.ndim(); i++) + for (int i = 1; i < in.ndim(); i++) in[i] = shape[i - 1]; SHAPE_ASSIGN_CHECK(*in_shape, i, in); } @@ -801,7 +801,7 @@ static bool WhileLoopShape(const nnvm::NodeAttrs& attrs, } auto out = mxnet::TShape(g_out_shape.ndim() + 1); out[0] = params.max_iterations; - for (size_t i = 1; i < out.ndim(); i++) + for (int i = 1; i < out.ndim(); i++) out[i] = g_out_shape[i - 1]; SHAPE_ASSIGN_CHECK(*out_shape, i, out); } diff --git a/src/operator/nn/pooling.cc b/src/operator/nn/pooling.cc index 7cdef54abd83..9d8c2fed8b50 100644 --- a/src/operator/nn/pooling.cc +++ b/src/operator/nn/pooling.cc @@ -118,7 +118,7 @@ static bool PoolingShape(const nnvm::NodeAttrs &attrs, int layout = param.GetLayout(dshape.ndim()); if (param.global_pool) { mxnet::TShape oshape = dshape; - size_t c_index = 0; + int c_index = 0; switch (layout) { case mshadow::kNCW: case mshadow::kNCHW: diff --git a/src/operator/quantization/quantized_flatten-inl.h b/src/operator/quantization/quantized_flatten-inl.h index 99a262de19ca..064cf42ef0a0 100644 --- a/src/operator/quantization/quantized_flatten-inl.h +++ b/src/operator/quantization/quantized_flatten-inl.h @@ -88,8 +88,8 @@ inline bool QuantizedFlattenShape(const nnvm::NodeAttrs& attrs, const mxnet::TShape 
&dshape = (*in_attrs)[0]; if (shape_is_none(dshape)) return false; - uint32_t target_dim = 1; - for (uint32_t i = 1; i < dshape.ndim(); ++i) { + int target_dim = 1; + for (int i = 1; i < dshape.ndim(); ++i) { target_dim *= dshape[i]; } diff --git a/src/operator/tensor/matrix_op-inl.h b/src/operator/tensor/matrix_op-inl.h index 38b153e43798..93c3d01e3888 100644 --- a/src/operator/tensor/matrix_op-inl.h +++ b/src/operator/tensor/matrix_op-inl.h @@ -724,13 +724,13 @@ inline bool SliceOpShape(const nnvm::NodeAttrs& attrs, mxnet::TShape oshape = dshape; MXNET_NDIM_SWITCH(dshape.ndim(), ndim, { - common::StaticArray begin, end, step; + common::StaticArray begin, end, step; GetIndexRange(dshape, param.begin, param.end, param.step, &begin, &end, &step); for (int i = 0; i < param.begin.ndim(); ++i) { const int b = begin[i], e = end[i], s = step[i]; SetSliceOpOutputDimSize(i, b, e, s, &oshape); } - }); + }) SHAPE_ASSIGN_CHECK(*out_attrs, 0, oshape); return !shape_is_none(dshape) && !shape_is_none(oshape); @@ -939,11 +939,11 @@ inline bool SliceAssignOpShape(const nnvm::NodeAttrs& attrs, MXNET_NDIM_SWITCH(dshape.ndim(), ndim, { common::StaticArray begin, end, step; GetIndexRange(dshape, param.begin, param.end, param.step, &begin, &end, &step); - for (index_t i = 0; i < param.begin.ndim(); ++i) { + for (int i = 0; i < param.begin.ndim(); ++i) { const int b = begin[i], e = end[i], s = step[i]; SetSliceOpOutputDimSize(i, b, e, s, &vshape); } - }); + }) SHAPE_ASSIGN_CHECK(*in_attrs, 1, vshape); SHAPE_ASSIGN_CHECK(*out_attrs, 0, dshape); return true; @@ -2152,7 +2152,7 @@ struct SqueezeParam : public dmlc::Parameter { inline size_t SqueezeShapeHelper(mxnet::TShape* shape) { CHECK(shape != nullptr); size_t count = 0; - for (size_t i = 0; i < shape->ndim(); ++i) { + for (int i = 0; i < shape->ndim(); ++i) { if ((*shape)[i] == 0) { ++count; } else { @@ -2175,7 +2175,7 @@ inline bool SqueezeShape(const nnvm::NodeAttrs& attrs, if (param.axis.has_value()) { // preprocess axis mxnet::TShape axes = param.axis.value(); - for (size_t i = 0; i < axes.ndim(); ++i) { + for (int i = 0; i < axes.ndim(); ++i) { if (axes[i] < 0) { axes[i] += dndim; CHECK_GE(axes[i], 0) @@ -2190,7 +2190,7 @@ inline bool SqueezeShape(const nnvm::NodeAttrs& attrs, oshape[axes[i]] = 0; } } else { - for (size_t i = 0; i < oshape.ndim(); ++i) { + for (int i = 0; i < oshape.ndim(); ++i) { if (oshape[i] == 1) oshape[i] = 0; } } @@ -2237,7 +2237,7 @@ inline bool DepthToSpaceOpShape(const nnvm::NodeAttrs& attrs, expected_out[0] = in_shape[0]; expected_out[1] = in_shape[1] / (block * block); - size_t i = 2; + int i = 2; while (i < expected_out.ndim()) { expected_out[i] = in_shape[i] * block; ++i; @@ -2404,7 +2404,7 @@ inline bool SpaceToDepthOpShape(const nnvm::NodeAttrs& attrs, expected_out[0] = in_shape[0]; expected_out[1] = in_shape[1] * block * block; - uint32_t i = 2; + int i = 2; while (i < expected_out.ndim()) { expected_out[i] = in_shape[i] / block; ++i; From 4bd1805d7676f494e7f70f496b6c086316154e73 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Mon, 15 Apr 2019 14:17:30 -0700 Subject: [PATCH 09/30] fix compiler warning --- src/io/iter_sparse_batchloader.h | 2 +- src/operator/pad-inl.h | 2 +- src/operator/tensor/matrix_op-inl.h | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/io/iter_sparse_batchloader.h b/src/io/iter_sparse_batchloader.h index 17c509a0f56b..c0d856df89ec 100644 --- a/src/io/iter_sparse_batchloader.h +++ b/src/io/iter_sparse_batchloader.h @@ -67,7 +67,7 @@ class SparseBatchLoader : public 
BatchLoader, public SparseIIterator this->head_ = 0; // if overflown from previous round, directly return false, until before first is called if (num_overflow_ != 0) return false; - index_t top = 0; + size_t top = 0; offsets_.clear(); while (sparse_base_->Next()) { const DataInst& inst = sparse_base_->Value(); diff --git a/src/operator/pad-inl.h b/src/operator/pad-inl.h index 140d7099e817..89b0ab7780b6 100644 --- a/src/operator/pad-inl.h +++ b/src/operator/pad-inl.h @@ -230,7 +230,7 @@ class PadProp : public OperatorProperty { } } mxnet::TShape oshape = dshape; - for (size_t i = 0; i < dshape.ndim(); ++i) { + for (int i = 0; i < dshape.ndim(); ++i) { oshape[i] = param_.pad_width[2 * i] + param_.pad_width[2 * i + 1] + dshape[i]; } diff --git a/src/operator/tensor/matrix_op-inl.h b/src/operator/tensor/matrix_op-inl.h index 93c3d01e3888..75a84e2d00fa 100644 --- a/src/operator/tensor/matrix_op-inl.h +++ b/src/operator/tensor/matrix_op-inl.h @@ -724,9 +724,9 @@ inline bool SliceOpShape(const nnvm::NodeAttrs& attrs, mxnet::TShape oshape = dshape; MXNET_NDIM_SWITCH(dshape.ndim(), ndim, { - common::StaticArray begin, end, step; + common::StaticArray begin, end, step; GetIndexRange(dshape, param.begin, param.end, param.step, &begin, &end, &step); - for (int i = 0; i < param.begin.ndim(); ++i) { + for (uint32_t i = 0; i < param.begin.ndim(); ++i) { const int b = begin[i], e = end[i], s = step[i]; SetSliceOpOutputDimSize(i, b, e, s, &oshape); } @@ -939,7 +939,7 @@ inline bool SliceAssignOpShape(const nnvm::NodeAttrs& attrs, MXNET_NDIM_SWITCH(dshape.ndim(), ndim, { common::StaticArray begin, end, step; GetIndexRange(dshape, param.begin, param.end, param.step, &begin, &end, &step); - for (int i = 0; i < param.begin.ndim(); ++i) { + for (uint32_t i = 0; i < param.begin.ndim(); ++i) { const int b = begin[i], e = end[i], s = step[i]; SetSliceOpOutputDimSize(i, b, e, s, &vshape); } From 08e9b101fa723b2166bc6db4b0d92a86cd6ad626 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Mon, 15 Apr 2019 14:31:39 -0700 Subject: [PATCH 10/30] fix compiler warning --- src/operator/nn/mkldnn/mkldnn_base-inl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/operator/nn/mkldnn/mkldnn_base-inl.h b/src/operator/nn/mkldnn/mkldnn_base-inl.h index 0a89c0f31981..294723b68bd3 100644 --- a/src/operator/nn/mkldnn/mkldnn_base-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_base-inl.h @@ -462,7 +462,7 @@ mkldnn::memory::primitive_desc GetPrimitiveDesc(mkldnn::memory::primitive_desc p mkldnn_memory_format_t format); inline bool same_shape(const mxnet::TShape &shape, const mkldnn_dims_t dims, int ndims) { - if (shape.ndim() != (size_t)ndims) + if (shape.ndim() != ndims) return false; for (int i = 0; i < ndims; i++) if (shape[i] != dims[i]) From d3d6cc6b44f8356ace818ee8adb918f65274b0cf Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Mon, 15 Apr 2019 14:44:58 -0700 Subject: [PATCH 11/30] fix compiler warning --- src/operator/nn/mkldnn/mkldnn_transpose.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/operator/nn/mkldnn/mkldnn_transpose.cc b/src/operator/nn/mkldnn/mkldnn_transpose.cc index 0986d0616f75..65e49183c78c 100644 --- a/src/operator/nn/mkldnn/mkldnn_transpose.cc +++ b/src/operator/nn/mkldnn/mkldnn_transpose.cc @@ -57,7 +57,7 @@ class MKLDNNTransposeForward { auto axes_ndim = param.axes.ndim(); auto axes = mxnet::TShape(data_ndim); if (axes_ndim == 0) { - for (size_t i = 0; i < data_ndim; i++) { + for (int i = 0; i < data_ndim; i++) { axes[i] = data_ndim - i - 1; } } else { @@ -79,7 +79,7 @@ 
class MKLDNNTransposeForward { dst_fmt.data_type = mkldnn_f32; dst_fmt.format = mkldnn_blocked; - for (size_t i = 0; i < data_ndim; i++) + for (int i = 0; i < data_ndim; i++) dst_fmt.dims[i] = shape[i]; unsigned int total_stride = 1; From 7e3ed63858b9a0292fd2817df6b9067d96b1e043 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Mon, 15 Apr 2019 15:45:17 -0700 Subject: [PATCH 12/30] fix compiler error --- src/operator/swapaxis-inl.h | 4 ++-- src/operator/tensor/matrix_op-inl.h | 2 +- src/operator/tensor/sparse_retain-inl.h | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/operator/swapaxis-inl.h b/src/operator/swapaxis-inl.h index ce835084ab32..41cb940d957a 100644 --- a/src/operator/swapaxis-inl.h +++ b/src/operator/swapaxis-inl.h @@ -72,8 +72,8 @@ class SwapAxisOp : public Operator { uint32_t dim1, uint32_t dim2) { using namespace mshadow; using namespace mshadow::expr; - index_t ndim_in = shape.ndim(); - index_t si; + int ndim_in = shape.ndim(); + int si; if (dim1 > dim2) { std::swap(dim1, dim2); diff --git a/src/operator/tensor/matrix_op-inl.h b/src/operator/tensor/matrix_op-inl.h index 30a42bdc6763..adfec893c50c 100644 --- a/src/operator/tensor/matrix_op-inl.h +++ b/src/operator/tensor/matrix_op-inl.h @@ -656,7 +656,7 @@ inline void GetIndexRange(const mxnet::TShape& dshape, << "step and begin must have the same length"; } - for (index_t i = 0; i < param_begin.ndim(); ++i) { + for (int i = 0; i < param_begin.ndim(); ++i) { index_t s = param_step.ndim() != 0U && param_step[i].has_value() ? param_step[i].value() : 1; CHECK_NE(s, 0) << "slice op step[" << i << "] cannot be 0"; diff --git a/src/operator/tensor/sparse_retain-inl.h b/src/operator/tensor/sparse_retain-inl.h index 951bf80b81b8..04860e6f369f 100644 --- a/src/operator/tensor/sparse_retain-inl.h +++ b/src/operator/tensor/sparse_retain-inl.h @@ -290,7 +290,7 @@ void SparseRetainOpForwardRspImpl(mshadow::Stream *s, Kernel::Launch(s, output_data.Size(), output_data.dptr()); MSHADOW_IDX_TYPE_SWITCH(output_idx.type_flag_, RType, { // row index data type MSHADOW_TYPE_SWITCH(idx_data.type_flag_, IType, { // index array data type - if (input_idx.Size() == input_nd.shape()[0]) { // input rsp is dense + if (input_idx.Size() == static_cast(input_nd.shape()[0])) { // input rsp is dense using namespace mshadow; // copy indices Tensor output_idx_tensor = output_idx.FlatTo1D(s); From 54735db2f92f295111366ccaaa2987a4b99b822b Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Tue, 16 Apr 2019 10:42:32 -0700 Subject: [PATCH 13/30] change nnvm::Tuple to mxnet::Tuple --- src/c_api/c_api.cc | 2 +- src/common/serialization.h | 12 ++--- src/imperative/cached_op.h | 8 ++-- src/io/image_det_aug_default.cc | 4 +- src/io/image_io.cc | 2 +- .../contrib/deformable_convolution-inl.h | 2 +- src/operator/contrib/dgl_graph.cc | 4 +- src/operator/contrib/multi_proposal-inl.h | 12 ++--- src/operator/contrib/multibox_detection-inl.h | 2 +- src/operator/contrib/multibox_detection.cc | 2 +- src/operator/contrib/multibox_detection.cu | 2 +- src/operator/contrib/multibox_prior-inl.h | 8 ++-- src/operator/contrib/multibox_target-inl.h | 2 +- src/operator/contrib/multibox_target.cc | 2 +- src/operator/contrib/multibox_target.cu | 2 +- src/operator/contrib/proposal-inl.h | 12 ++--- src/operator/control_flow.cc | 36 +++++++-------- src/operator/image/image_random-inl.h | 12 ++--- src/operator/image/resize-inl.h | 4 +- src/operator/optimizer_op-inl.h | 8 ++-- src/operator/subgraph_op_common.h | 4 +- src/operator/swapaxis-inl.h | 6 +-- 
src/operator/tensor/histogram-inl.h | 4 +- src/operator/tensor/matrix_op-inl.h | 44 +++++++++---------- src/operator/tensor/slice-inl.h | 6 +-- 25 files changed, 101 insertions(+), 101 deletions(-) diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 45197aafe019..1af55a3543cc 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -471,7 +471,7 @@ MXNET_DLL int MXNDArrayReshape64(NDArrayHandle handle, NDArray *ptr = new NDArray(); API_BEGIN(); NDArray *arr = static_cast(handle); - nnvm::Tuple shape(dims, dims+ndim); + mxnet::Tuple shape(dims, dims+ndim); CHECK_GT(arr->shape().Size(), 0) << "Source ndarray's shape is undefined. Input shape: " << arr->shape(); mxnet::TShape new_shape = mxnet::op::InferReshapeShape(shape, arr->shape(), reverse); diff --git a/src/common/serialization.h b/src/common/serialization.h index 8192ee210a1c..c22d8bc82270 100644 --- a/src/common/serialization.h +++ b/src/common/serialization.h @@ -49,7 +49,7 @@ template inline size_t SerializedSize(const T &obj); template -inline size_t SerializedSize(const nnvm::Tuple &obj); +inline size_t SerializedSize(const mxnet::Tuple &obj); template inline size_t SerializedSize(const std::map &obj); @@ -64,7 +64,7 @@ template inline void Serialize(const T &obj, char **buffer); template -inline void Serialize(const nnvm::Tuple &obj, char **buffer); +inline void Serialize(const mxnet::Tuple &obj, char **buffer); template inline void Serialize(const std::map &obj, char **buffer); @@ -79,7 +79,7 @@ template inline void Deserialize(T *obj, const std::string &buffer, size_t *curr_pos); template -inline void Deserialize(nnvm::Tuple *obj, const std::string &buffer, size_t *curr_pos); +inline void Deserialize(mxnet::Tuple *obj, const std::string &buffer, size_t *curr_pos); template inline void Deserialize(std::map *obj, const std::string &buffer, size_t *curr_pos); @@ -102,7 +102,7 @@ inline size_t SerializedSize(const T &obj) { } template -inline size_t SerializedSize(const nnvm::Tuple &obj) { +inline size_t SerializedSize(const mxnet::Tuple &obj) { if (is_container::value) { size_t sum_val = 4; for (const auto& el : obj) { @@ -180,7 +180,7 @@ inline void Serialize(const T &obj, char **buffer) { } template -inline void Serialize(const nnvm::Tuple &obj, char **buffer) { +inline void Serialize(const mxnet::Tuple &obj, char **buffer) { uint32_t size = obj.ndim(); std::memcpy(*buffer, &size, 4); *buffer += 4; @@ -244,7 +244,7 @@ inline void Deserialize(T *obj, const std::string &buffer, size_t *curr_pos) { } template -inline void Deserialize(nnvm::Tuple *obj, const std::string &buffer, size_t *curr_pos) { +inline void Deserialize(mxnet::Tuple *obj, const std::string &buffer, size_t *curr_pos) { uint32_t size = obj->ndim(); std::memcpy(&size, &buffer[*curr_pos], 4); *curr_pos += 4; diff --git a/src/imperative/cached_op.h b/src/imperative/cached_op.h index b3192dc8281b..14b373edea57 100644 --- a/src/imperative/cached_op.h +++ b/src/imperative/cached_op.h @@ -36,8 +36,8 @@ struct CachedOpConfig : public dmlc::Parameter { bool static_alloc; bool static_shape; bool is_dynamic; - nnvm::Tuple data_indices; - nnvm::Tuple param_indices; + mxnet::Tuple data_indices; + mxnet::Tuple param_indices; std::string subgraph; DMLC_DECLARE_PARAMETER(CachedOpConfig) { DMLC_DECLARE_FIELD(static_alloc) @@ -59,10 +59,10 @@ struct CachedOpConfig : public dmlc::Parameter { .set_default(Imperative::BulkExecMaxNodeTrainBwd()) .describe("Segment size of bulk execution during backward pass."); DMLC_DECLARE_FIELD(data_indices) - .set_default(nnvm::Tuple()) + 
.set_default(mxnet::Tuple()) .describe("Position of argument variables."); DMLC_DECLARE_FIELD(param_indices) - .set_default(nnvm::Tuple()) + .set_default(mxnet::Tuple()) .describe("Position of parameters."); DMLC_DECLARE_FIELD(subgraph) .set_default(std::string("")) diff --git a/src/io/image_det_aug_default.cc b/src/io/image_det_aug_default.cc index 74e51b51603b..3bd37200b8e7 100644 --- a/src/io/image_det_aug_default.cc +++ b/src/io/image_det_aug_default.cc @@ -34,7 +34,7 @@ namespace mxnet { namespace io { -using nnvm::Tuple; +using mxnet::Tuple; namespace image_det_aug_default_enum { enum ImageDetAugDefaultCropEmitMode {kCenter, kOverlap}; @@ -462,7 +462,7 @@ class DefaultImageDetAugmenter : public ImageAugmenter { /*! \brief Check number of crop samplers and given parameters */ template - void ValidateCropParameters(nnvm::Tuple *param, const int num_sampler) { + void ValidateCropParameters(mxnet::Tuple *param, const int num_sampler) { if (num_sampler == 1) { CHECK_EQ(param->ndim(), 1); } else if (num_sampler > 1) { diff --git a/src/io/image_io.cc b/src/io/image_io.cc index 2196983928bb..8b5674027a4e 100644 --- a/src/io/image_io.cc +++ b/src/io/image_io.cc @@ -295,7 +295,7 @@ struct MakeBorderParam : public dmlc::Parameter { int top, bot, left, right; int type; double value; - nnvm::Tuple values; + mxnet::Tuple values; DMLC_DECLARE_PARAMETER(MakeBorderParam) { DMLC_DECLARE_FIELD(top) .describe("Top margin."); diff --git a/src/operator/contrib/deformable_convolution-inl.h b/src/operator/contrib/deformable_convolution-inl.h index f50641fca6d6..6aefe9668fac 100644 --- a/src/operator/contrib/deformable_convolution-inl.h +++ b/src/operator/contrib/deformable_convolution-inl.h @@ -129,7 +129,7 @@ class DeformableConvolutionOp : public Operator { // calculate the shape of col_buffer mxnet::TShape col_buffer_shape(num_spatial_axes_ + 1); col_buffer_shape[0] = conv_in_channels_ * param_.kernel.Size(); - for (size_t i = 1; i < col_buffer_shape.ndim(); ++i) { + for (int i = 1; i < col_buffer_shape.ndim(); ++i) { col_buffer_shape[i] = out_data[0].shape_[i + 1]; } // create a column buffer using workspace and col_buffer_shape diff --git a/src/operator/contrib/dgl_graph.cc b/src/operator/contrib/dgl_graph.cc index f19af84ce9c6..ec6d81946d0e 100644 --- a/src/operator/contrib/dgl_graph.cc +++ b/src/operator/contrib/dgl_graph.cc @@ -1265,7 +1265,7 @@ void EdgeIDForwardCsrImpl(const OpContext& ctx, CHECK_EQ(req, kWriteTo) << "EdgeID with CSR only supports kWriteTo"; Stream *s = ctx.get_stream(); const NDArray& u = inputs[1]; - const nnvm::dim_t out_elems = u.shape().Size(); + const dim_t out_elems = u.shape().Size(); if (!inputs[0].storage_initialized()) { MSHADOW_TYPE_SWITCH(output.dtype(), DType, { Kernel, xpu>::Launch( @@ -1422,7 +1422,7 @@ the data value of float32. 
struct SubgraphCompactParam : public dmlc::Parameter { int num_args; bool return_mapping; - nnvm::Tuple graph_sizes; + mxnet::Tuple graph_sizes; DMLC_DECLARE_PARAMETER(SubgraphCompactParam) { DMLC_DECLARE_FIELD(num_args).set_lower_bound(2) .describe("Number of input arguments."); diff --git a/src/operator/contrib/multi_proposal-inl.h b/src/operator/contrib/multi_proposal-inl.h index 4b9a41c2fa87..9d4e652783ef 100644 --- a/src/operator/contrib/multi_proposal-inl.h +++ b/src/operator/contrib/multi_proposal-inl.h @@ -56,8 +56,8 @@ struct MultiProposalParam : public dmlc::Parameter { int rpn_post_nms_top_n; float threshold; int rpn_min_size; - nnvm::Tuple scales; - nnvm::Tuple ratios; + mxnet::Tuple scales; + mxnet::Tuple ratios; int feature_stride; bool output_score; bool iou_loss; @@ -73,10 +73,10 @@ struct MultiProposalParam : public dmlc::Parameter { DMLC_DECLARE_FIELD(rpn_min_size).set_default(16) .describe("Minimum height or width in proposal"); tmp[0] = 4.0f; tmp[1] = 8.0f; tmp[2] = 16.0f; tmp[3] = 32.0f; - DMLC_DECLARE_FIELD(scales).set_default(nnvm::Tuple(tmp, tmp + 4)) + DMLC_DECLARE_FIELD(scales).set_default(mxnet::Tuple(tmp, tmp + 4)) .describe("Used to generate anchor windows by enumerating scales"); tmp[0] = 0.5f; tmp[1] = 1.0f; tmp[2] = 2.0f; - DMLC_DECLARE_FIELD(ratios).set_default(nnvm::Tuple(tmp, tmp + 3)) + DMLC_DECLARE_FIELD(ratios).set_default(mxnet::Tuple(tmp, tmp + 3)) .describe("Used to generate anchor windows by enumerating ratios"); DMLC_DECLARE_FIELD(feature_stride).set_default(16) .describe("The size of the receptive field each unit in the convolution layer of the rpn," @@ -214,8 +214,8 @@ inline void _Transform(float scale, // out_anchors must have shape (n, 5), where n is ratios.size() * scales.size() inline void GenerateAnchors(const std::vector& base_anchor, - const nnvm::Tuple& ratios, - const nnvm::Tuple& scales, + const mxnet::Tuple& ratios, + const mxnet::Tuple& scales, std::vector *out_anchors) { for (size_t j = 0; j < ratios.ndim(); ++j) { for (size_t k = 0; k < scales.ndim(); ++k) { diff --git a/src/operator/contrib/multibox_detection-inl.h b/src/operator/contrib/multibox_detection-inl.h index 977126ad269d..440037e00d5e 100644 --- a/src/operator/contrib/multibox_detection-inl.h +++ b/src/operator/contrib/multibox_detection-inl.h @@ -52,7 +52,7 @@ struct MultiBoxDetectionParam : public dmlc::Parameter { bool force_suppress; int keep_topk; int nms_topk; - nnvm::Tuple variances; + mxnet::Tuple variances; DMLC_DECLARE_PARAMETER(MultiBoxDetectionParam) { DMLC_DECLARE_FIELD(clip).set_default(true) .describe("Clip out-of-boundary boxes."); diff --git a/src/operator/contrib/multibox_detection.cc b/src/operator/contrib/multibox_detection.cc index 8d1082914df7..65fe5f1208bb 100644 --- a/src/operator/contrib/multibox_detection.cc +++ b/src/operator/contrib/multibox_detection.cc @@ -87,7 +87,7 @@ inline void MultiBoxDetectionForward(const Tensor &out, const Tensor &temp_space, const float threshold, const bool clip, - const nnvm::Tuple &variances, + const mxnet::Tuple &variances, const float nms_threshold, const bool force_suppress, const int nms_topk) { diff --git a/src/operator/contrib/multibox_detection.cu b/src/operator/contrib/multibox_detection.cu index 98151f8b8755..51b2aa7cdc77 100644 --- a/src/operator/contrib/multibox_detection.cu +++ b/src/operator/contrib/multibox_detection.cu @@ -213,7 +213,7 @@ inline void MultiBoxDetectionForward(const Tensor &out, const Tensor &temp_space, const float threshold, const bool clip, - const nnvm::Tuple &variances, + const 
mxnet::Tuple &variances, const float nms_threshold, const bool force_suppress, const int nms_topk) { diff --git a/src/operator/contrib/multibox_prior-inl.h b/src/operator/contrib/multibox_prior-inl.h index 3636a6016bd2..654b83fee28f 100644 --- a/src/operator/contrib/multibox_prior-inl.h +++ b/src/operator/contrib/multibox_prior-inl.h @@ -57,11 +57,11 @@ enum MultiBoxPriorOpOutputs {kOut}; } // namespace mboxprior_enum struct MultiBoxPriorParam : public dmlc::Parameter { - nnvm::Tuple sizes; - nnvm::Tuple ratios; + mxnet::Tuple sizes; + mxnet::Tuple ratios; bool clip; - nnvm::Tuple steps; - nnvm::Tuple offsets; + mxnet::Tuple steps; + mxnet::Tuple offsets; DMLC_DECLARE_PARAMETER(MultiBoxPriorParam) { DMLC_DECLARE_FIELD(sizes).set_default({1.0f}) .describe("List of sizes of generated MultiBoxPriores."); diff --git a/src/operator/contrib/multibox_target-inl.h b/src/operator/contrib/multibox_target-inl.h index f7a92882650c..6034f13ef734 100644 --- a/src/operator/contrib/multibox_target-inl.h +++ b/src/operator/contrib/multibox_target-inl.h @@ -62,7 +62,7 @@ struct MultiBoxTargetParam : public dmlc::Parameter { float negative_mining_ratio; float negative_mining_thresh; int minimum_negative_samples; - nnvm::Tuple variances; + mxnet::Tuple variances; DMLC_DECLARE_PARAMETER(MultiBoxTargetParam) { DMLC_DECLARE_FIELD(overlap_threshold).set_default(0.5f) .describe("Anchor-GT overlap threshold to be regarded as a positive match."); diff --git a/src/operator/contrib/multibox_target.cc b/src/operator/contrib/multibox_target.cc index a1f2aac250ff..a1808c5a7c81 100644 --- a/src/operator/contrib/multibox_target.cc +++ b/src/operator/contrib/multibox_target.cc @@ -81,7 +81,7 @@ inline void MultiBoxTargetForward(const Tensor &loc_target, const float negative_mining_ratio, const float negative_mining_thresh, const int minimum_negative_samples, - const nnvm::Tuple &variances) { + const mxnet::Tuple &variances) { const DType *p_anchor = anchors.dptr_; const int num_batches = labels.size(0); const int num_labels = labels.size(1); diff --git a/src/operator/contrib/multibox_target.cu b/src/operator/contrib/multibox_target.cu index ca0428348a6c..a44c08b08923 100644 --- a/src/operator/contrib/multibox_target.cu +++ b/src/operator/contrib/multibox_target.cu @@ -349,7 +349,7 @@ inline void MultiBoxTargetForward(const Tensor &loc_target, const float negative_mining_ratio, const float negative_mining_thresh, const int minimum_negative_samples, - const nnvm::Tuple &variances) { + const mxnet::Tuple &variances) { const int num_batches = labels.size(0); const int num_labels = labels.size(1); const int label_width = labels.size(2); diff --git a/src/operator/contrib/proposal-inl.h b/src/operator/contrib/proposal-inl.h index 9908ca96ec5f..43905c8793bb 100644 --- a/src/operator/contrib/proposal-inl.h +++ b/src/operator/contrib/proposal-inl.h @@ -54,8 +54,8 @@ struct ProposalParam : public dmlc::Parameter { int rpn_post_nms_top_n; float threshold; int rpn_min_size; - nnvm::Tuple scales; - nnvm::Tuple ratios; + mxnet::Tuple scales; + mxnet::Tuple ratios; int feature_stride; bool output_score; bool iou_loss; @@ -71,10 +71,10 @@ struct ProposalParam : public dmlc::Parameter { DMLC_DECLARE_FIELD(rpn_min_size).set_default(16) .describe("Minimum height or width in proposal"); tmp[0] = 4.0f; tmp[1] = 8.0f; tmp[2] = 16.0f; tmp[3] = 32.0f; - DMLC_DECLARE_FIELD(scales).set_default(nnvm::Tuple(tmp, tmp + 4)) + DMLC_DECLARE_FIELD(scales).set_default(mxnet::Tuple(tmp, tmp + 4)) .describe("Used to generate anchor windows by enumerating 
scales"); tmp[0] = 0.5f; tmp[1] = 1.0f; tmp[2] = 2.0f; - DMLC_DECLARE_FIELD(ratios).set_default(nnvm::Tuple(tmp, tmp + 3)) + DMLC_DECLARE_FIELD(ratios).set_default(mxnet::Tuple(tmp, tmp + 3)) .describe("Used to generate anchor windows by enumerating ratios"); DMLC_DECLARE_FIELD(feature_stride).set_default(16) .describe("The size of the receptive field each unit in the convolution layer of the rpn," @@ -212,8 +212,8 @@ inline void _Transform(float scale, // out_anchors must have shape (n, 5), where n is ratios.size() * scales.size() inline void GenerateAnchors(const std::vector& base_anchor, - const nnvm::Tuple& ratios, - const nnvm::Tuple& scales, + const mxnet::Tuple& ratios, + const mxnet::Tuple& scales, std::vector *out_anchors) { for (size_t j = 0; j < ratios.ndim(); ++j) { for (size_t k = 0; k < scales.ndim(); ++k) { diff --git a/src/operator/control_flow.cc b/src/operator/control_flow.cc index 87da1f9871f9..d4e81fdfcb0a 100644 --- a/src/operator/control_flow.cc +++ b/src/operator/control_flow.cc @@ -37,11 +37,11 @@ struct ForeachParam : public dmlc::Parameter { int num_outputs; int num_out_data; // The location of states in the subgraph inputs. - nnvm::Tuple in_state_locs; + mxnet::Tuple in_state_locs; // The location of data arrays in the subgraph inputs. - nnvm::Tuple in_data_locs; + mxnet::Tuple in_data_locs; // The location of remaining arrays in the subgraph inputs. - nnvm::Tuple remain_locs; + mxnet::Tuple remain_locs; DMLC_DECLARE_PARAMETER(ForeachParam) { DMLC_DECLARE_FIELD(num_args).set_lower_bound(1) .describe("Number of inputs."); @@ -258,7 +258,7 @@ static void ForeachGradComputeExCPU(const OpStatePtr& state_ptr, template static void remap(const std::vector &op_in, size_t start, - const nnvm::Tuple &locs, std::vector *subg_in) { + const mxnet::Tuple &locs, std::vector *subg_in) { auto op_in_it = op_in.begin() + start; for (size_t i = 0; i < locs.ndim(); i++) { dim_t loc = locs[i]; @@ -488,9 +488,9 @@ struct WhileLoopParam : public dmlc::Parameter { // `cond_input_locs' contains indices of inputs fed to `cond', and // `func_input_locs' contains indices of inputs fed to `func'. // `func_var_locs' are indices in which input "variables" are stored in func's inputs. 
- nnvm::Tuple cond_input_locs; - nnvm::Tuple func_input_locs; - nnvm::Tuple func_var_locs; + mxnet::Tuple cond_input_locs; + mxnet::Tuple func_input_locs; + mxnet::Tuple func_var_locs; DMLC_DECLARE_PARAMETER(WhileLoopParam) { DMLC_DECLARE_FIELD(num_args).set_lower_bound(2) .describe("Number of input arguments, including cond and func as two symbol inputs."); @@ -538,9 +538,9 @@ class WhileLoopState: public LoopState { n_iterations(0U), cond_op(LoopState::MakeSharedOp(cond)), oi_map(params.func_var_locs.ndim(), -1) { - const nnvm::Tuple &func_input_locs = params.func_input_locs; - const nnvm::Tuple &func_var_locs = params.func_var_locs; - const nnvm::Tuple &cond_input_locs = params.cond_input_locs; + const mxnet::Tuple &func_input_locs = params.func_input_locs; + const mxnet::Tuple &func_var_locs = params.func_var_locs; + const mxnet::Tuple &cond_input_locs = params.cond_input_locs; for (size_t i = 0; i < func_var_locs.ndim(); ++i) { dim_t pos_i = func_input_locs[func_var_locs[i]]; for (size_t j = 0; j < cond_input_locs.ndim(); ++j) { @@ -740,7 +740,7 @@ static bool WhileLoopShape(const nnvm::NodeAttrs& attrs, // infer shape for cond and func auto infer_subg = [&params, in_shape, out_shape](std::shared_ptr subg, ShapeVector *_subg_out, - const nnvm::Tuple &input_locs, + const mxnet::Tuple &input_locs, int num_out_data, bool fill_out_shape) { // create subg_in @@ -915,9 +915,9 @@ WhileLoopGradient(const nnvm::NodePtr& n, const std::vector& og struct CondParam : public dmlc::Parameter { int num_args; int num_outputs; - nnvm::Tuple cond_input_locs; - nnvm::Tuple then_input_locs; - nnvm::Tuple else_input_locs; + mxnet::Tuple cond_input_locs; + mxnet::Tuple then_input_locs; + mxnet::Tuple else_input_locs; DMLC_DECLARE_PARAMETER(CondParam) { DMLC_DECLARE_FIELD(num_args).set_lower_bound(3) .describe("Number of input arguments, including cond, then and else as three symbol inputs."); @@ -992,7 +992,7 @@ static void CondComputeExCPU(const OpStatePtr& state_ptr, state.cond_op->Forward(nullptr, cond_input_ptr, cond_output_ptr); branch_selection = as_bool_scalar(*cond_output_ptr[0]); // select the right branch - const nnvm::Tuple &func_input_locs = branch_selection + const mxnet::Tuple &func_input_locs = branch_selection ? params.then_input_locs : params.else_input_locs; LoopState &loop_state = branch_selection @@ -1017,7 +1017,7 @@ static void CondGradComputeExCPU(const OpStatePtr& state_ptr, // select the right branch int branch_selection = state.branch_selection; CHECK_NE(branch_selection, -1); - const nnvm::Tuple &func_input_locs = branch_selection + const mxnet::Tuple &func_input_locs = branch_selection ? params.then_input_locs : params.else_input_locs; LoopState &loop_state = branch_selection @@ -1048,7 +1048,7 @@ static bool CondShape(const nnvm::NodeAttrs& attrs, // infer shape for cond, then and else auto infer_subg = [&params, in_shape, out_shape](std::shared_ptr subg, ShapeVector *_subg_out, - const nnvm::Tuple &input_locs, + const mxnet::Tuple &input_locs, bool fill_out_shape) { // create subg_in mxnet::ShapeVector subg_in; @@ -1190,7 +1190,7 @@ static bool BackwardCondStorageType(const nnvm::NodeAttrs& attrs, CHECK_EQ(out_attrs->size() + 3U, (size_t) params.num_args); CHECK_EQ(attrs.subgraphs.size(), 3U); static const std::function is_udf = is_stype_udf; - auto sub_pass = [&](const std::shared_ptr &subg, const nnvm::Tuple &input_locs) { + auto sub_pass = [&](const std::shared_ptr &subg, const mxnet::Tuple &input_locs) { // A.
first construct subg_in_attrs // need subg_in_attrs as subg_bwd_out (copy), subg_fwd_in (extract), subg_fwd_out (copy) std::vector subg_in_attrs; diff --git a/src/operator/image/image_random-inl.h b/src/operator/image/image_random-inl.h index c37324678120..9d36f9838033 100644 --- a/src/operator/image/image_random-inl.h +++ b/src/operator/image/image_random-inl.h @@ -215,16 +215,16 @@ void ToTensorOpForward(const nnvm::NodeAttrs &attrs, } struct NormalizeParam : public dmlc::Parameter { - nnvm::Tuple mean; - nnvm::Tuple std; + mxnet::Tuple mean; + mxnet::Tuple std; DMLC_DECLARE_PARAMETER(NormalizeParam) { DMLC_DECLARE_FIELD(mean) - .set_default(nnvm::Tuple {0.0f, 0.0f, 0.0f, 0.0f}) + .set_default(mxnet::Tuple {0.0f, 0.0f, 0.0f, 0.0f}) .describe("Sequence of means for each channel. " "Default value is 0."); DMLC_DECLARE_FIELD(std) - .set_default(nnvm::Tuple {1.0f, 1.0f, 1.0f, 1.0f}) + .set_default(mxnet::Tuple {1.0f, 1.0f, 1.0f, 1.0f}) .describe("Sequence of standard deviations for each channel. " "Default value is 1."); } @@ -981,7 +981,7 @@ inline void RandomColorJitter(const nnvm::NodeAttrs &attrs, } struct AdjustLightingParam : public dmlc::Parameter { - nnvm::Tuple alpha; + mxnet::Tuple alpha; DMLC_DECLARE_PARAMETER(AdjustLightingParam) { DMLC_DECLARE_FIELD(alpha) .describe("The lighting alphas for the R, G, B channels."); @@ -997,7 +997,7 @@ struct RandomLightingParam : public dmlc::Parameter { } }; -inline void AdjustLightingImpl(const nnvm::Tuple& alpha, +inline void AdjustLightingImpl(const mxnet::Tuple& alpha, const OpContext &ctx, const std::vector &inputs, const std::vector &req, diff --git a/src/operator/image/resize-inl.h b/src/operator/image/resize-inl.h index de2189838d76..4ebebbfb272c 100644 --- a/src/operator/image/resize-inl.h +++ b/src/operator/image/resize-inl.h @@ -49,12 +49,12 @@ void ResizeImplCUDA(Stream *s, #endif // MXNET_USE_CUDA struct ResizeParam : public dmlc::Parameter { - nnvm::Tuple size; + mxnet::Tuple size; bool keep_ratio; int interp; DMLC_DECLARE_PARAMETER(ResizeParam) { DMLC_DECLARE_FIELD(size) - .set_default(nnvm::Tuple()) + .set_default(mxnet::Tuple()) .describe("Size of new image. 
Could be (width, height) or (size)"); DMLC_DECLARE_FIELD(keep_ratio) .describe("Whether to resize the short edge or both edges to `size`, " diff --git a/src/operator/optimizer_op-inl.h b/src/operator/optimizer_op-inl.h index 49eb96b9f8b2..bd923aebbb80 100644 --- a/src/operator/optimizer_op-inl.h +++ b/src/operator/optimizer_op-inl.h @@ -83,8 +83,8 @@ struct SGDParam : public dmlc::Parameter { }; struct MultiSGDParam : public dmlc::Parameter { - nnvm::Tuple lrs; - nnvm::Tuple wds; + mxnet::Tuple lrs; + mxnet::Tuple wds; float rescale_grad; float clip_gradient; int num_weights; @@ -110,8 +110,8 @@ struct MultiSGDParam : public dmlc::Parameter { }; struct MultiSGDMomParam : public dmlc::Parameter { - nnvm::Tuple lrs; - nnvm::Tuple wds; + mxnet::Tuple lrs; + mxnet::Tuple wds; float momentum; float rescale_grad; float clip_gradient; diff --git a/src/operator/subgraph_op_common.h b/src/operator/subgraph_op_common.h index 91adf576dc07..3147f873277d 100644 --- a/src/operator/subgraph_op_common.h +++ b/src/operator/subgraph_op_common.h @@ -67,7 +67,7 @@ bool is_type_udf(const int &x); template void extract_by_loc(const std::vector &array, - const nnvm::Tuple input_locs, + const mxnet::Tuple input_locs, std::vector *out) { out->clear(); out->reserve(input_locs.ndim()); @@ -94,7 +94,7 @@ bool fill_value(T *x, T *y, bool x_empty, bool y_empty) { } template -bool sync_in_in(const nnvm::Tuple &input_locs, +bool sync_in_in(const mxnet::Tuple &input_locs, std::vector *in, std::vector *subg_in, std::function is_empty) { diff --git a/src/operator/swapaxis-inl.h b/src/operator/swapaxis-inl.h index 41cb940d957a..b17a81f75bc6 100644 --- a/src/operator/swapaxis-inl.h +++ b/src/operator/swapaxis-inl.h @@ -69,7 +69,7 @@ class SwapAxisOp : public Operator { void Reshape2Five(mshadow::Shape<5> *inter_shape, const mxnet::TShape &shape, - uint32_t dim1, uint32_t dim2) { + int dim1, int dim2) { using namespace mshadow; using namespace mshadow::expr; int ndim_in = shape.ndim(); @@ -106,8 +106,8 @@ class SwapAxisOp : public Operator { const std::vector &req) { using namespace mshadow; using namespace mshadow::expr; - uint32_t dim1 = param_.dim1; - uint32_t dim2 = param_.dim2; + int dim1 = param_.dim1; + int dim2 = param_.dim2; TBlob data_in = in_data[swapaxisenum::kData]; TBlob data_out = out_data[swapaxisenum::kData]; diff --git a/src/operator/tensor/histogram-inl.h b/src/operator/tensor/histogram-inl.h index 51d0bdb6c2b6..00b6168c39b8 100644 --- a/src/operator/tensor/histogram-inl.h +++ b/src/operator/tensor/histogram-inl.h @@ -46,13 +46,13 @@ namespace op { struct HistogramParam : public dmlc::Parameter { dmlc::optional bin_cnt; - dmlc::optional> range; + dmlc::optional> range; DMLC_DECLARE_PARAMETER(HistogramParam) { DMLC_DECLARE_FIELD(bin_cnt) .set_default(dmlc::optional()) .describe("Number of bins for uniform case"); DMLC_DECLARE_FIELD(range) - .set_default(dmlc::optional>()) + .set_default(dmlc::optional>()) .describe("The lower and upper range of the bins. if not provided, " "range is simply (a.min(), a.max()). values outside the " "range are ignored. 
the first element of the range must be " diff --git a/src/operator/tensor/matrix_op-inl.h b/src/operator/tensor/matrix_op-inl.h index adfec893c50c..c8ce06a67ee5 100644 --- a/src/operator/tensor/matrix_op-inl.h +++ b/src/operator/tensor/matrix_op-inl.h @@ -49,11 +49,11 @@ namespace op { struct ReshapeParam : public dmlc::Parameter { mxnet::TShape target_shape; bool keep_highest; - nnvm::Tuple shape; + mxnet::Tuple shape; bool reverse; DMLC_DECLARE_PARAMETER(ReshapeParam) { DMLC_DECLARE_FIELD(shape) - .set_default(nnvm::Tuple()) + .set_default(mxnet::Tuple()) .describe("The target shape"); DMLC_DECLARE_FIELD(reverse) .set_default(false) @@ -71,7 +71,7 @@ struct ReshapeParam : public dmlc::Parameter { }; template -inline mxnet::TShape InferReshapeShape(const nnvm::Tuple& shape, +inline mxnet::TShape InferReshapeShape(const mxnet::Tuple& shape, const mxnet::TShape& dshape, bool reverse) { std::vector dshape_vec; std::vector param_shape_vec(shape.begin(), shape.end()); @@ -634,9 +634,9 @@ void SliceEx(const nnvm::NodeAttrs& attrs, template inline void GetIndexRange(const mxnet::TShape& dshape, - const nnvm::Tuple>& param_begin, - const nnvm::Tuple>& param_end, - const nnvm::Tuple>& param_step, + const mxnet::Tuple>& param_begin, + const mxnet::Tuple>& param_end, + const mxnet::Tuple>& param_step, common::StaticArray* begin, common::StaticArray* end, common::StaticArray* step) { @@ -997,8 +997,8 @@ void SliceAssignOpForward(const nnvm::NodeAttrs& attrs, struct SliceAssignScalarParam : public dmlc::Parameter { double scalar; - nnvm::Tuple> begin, end; - nnvm::Tuple> step; + mxnet::Tuple> begin, end; + mxnet::Tuple> step; DMLC_DECLARE_PARAMETER(SliceAssignScalarParam) { DMLC_DECLARE_FIELD(scalar) .set_default(0) @@ -1008,7 +1008,7 @@ struct SliceAssignScalarParam : public dmlc::Parameter { DMLC_DECLARE_FIELD(end) .describe("ending indices for the slice operation, supports negative indices."); DMLC_DECLARE_FIELD(step) - .set_default(nnvm::Tuple>()) + .set_default(mxnet::Tuple>()) .describe("step for the slice operation, supports negative values."); } }; @@ -1305,9 +1305,9 @@ inline bool SliceLikeShape(const nnvm::NodeAttrs& attrs, inline void SliceLikeInferRanges(const mxnet::TShape& dshape, const mxnet::TShape& fshape, const mxnet::TShape& axes, - nnvm::Tuple>* param_begin, - nnvm::Tuple>* param_end, - nnvm::Tuple>* param_step) { + mxnet::Tuple>* param_begin, + mxnet::Tuple>* param_end, + mxnet::Tuple>* param_step) { std::vector> pb(dshape.ndim()); std::vector> pe(dshape.ndim()); std::vector> ps(dshape.ndim()); @@ -1334,9 +1334,9 @@ inline void SliceLikeInferRanges(const mxnet::TShape& dshape, ps[axis] = 1; } } - *param_begin = nnvm::Tuple>(pb.begin(), pb.end()); - *param_end = nnvm::Tuple>(pe.begin(), pe.end()); - *param_step = nnvm::Tuple>(ps.begin(), ps.end()); + *param_begin = mxnet::Tuple>(pb.begin(), pb.end()); + *param_end = mxnet::Tuple>(pe.begin(), pe.end()); + *param_step = mxnet::Tuple>(ps.begin(), ps.end()); } template @@ -1355,9 +1355,9 @@ void SliceLikeForward(const nnvm::NodeAttrs& attrs, const TBlob& out = outputs[0]; const mxnet::TShape& ishape = data.shape_; const mxnet::TShape& from_shape = inputs[1].shape_; - nnvm::Tuple> param_begin; - nnvm::Tuple> param_end; - nnvm::Tuple> param_step; + mxnet::Tuple> param_begin; + mxnet::Tuple> param_end; + mxnet::Tuple> param_step; SliceLikeInferRanges(ishape, from_shape, param.axes, ¶m_begin, ¶m_end, ¶m_step); MXNET_NDIM_SWITCH(data.ndim(), ndim, { @@ -1403,9 +1403,9 @@ void SliceLikeBackward(const nnvm::NodeAttrs& attrs, const 
mxnet::TShape& ishape = ograd.shape_; const mxnet::TShape& from_shape = outputs[1].shape_; - nnvm::Tuple> param_begin; - nnvm::Tuple> param_end; - nnvm::Tuple> param_step; + mxnet::Tuple> param_begin; + mxnet::Tuple> param_end; + mxnet::Tuple> param_step; SliceLikeInferRanges(ishape, from_shape, param.axes, &param_begin, &param_end, &param_step); MXNET_NDIM_SWITCH(ograd.ndim(), ndim, { @@ -1919,7 +1919,7 @@ void TileOpBackward(const nnvm::NodeAttrs& attrs, } struct ReverseParam : public dmlc::Parameter { - nnvm::Tuple axis; + mxnet::Tuple axis; DMLC_DECLARE_PARAMETER(ReverseParam) { DMLC_DECLARE_FIELD(axis) .describe("The axis which to reverse elements."); diff --git a/src/operator/tensor/slice-inl.h b/src/operator/tensor/slice-inl.h index 4e94cbeda46c..78a2bd8c7b45 100644 --- a/src/operator/tensor/slice-inl.h +++ b/src/operator/tensor/slice-inl.h @@ -34,15 +34,15 @@ namespace mxnet { namespace op { struct SliceParam : public dmlc::Parameter { - nnvm::Tuple> begin, end; - nnvm::Tuple> step; + mxnet::Tuple> begin, end; + mxnet::Tuple> step; DMLC_DECLARE_PARAMETER(SliceParam) { DMLC_DECLARE_FIELD(begin) .describe("starting indices for the slice operation, supports negative indices."); DMLC_DECLARE_FIELD(end) .describe("ending indices for the slice operation, supports negative indices."); DMLC_DECLARE_FIELD(step) - .set_default(nnvm::Tuple>()) + .set_default(mxnet::Tuple>()) .describe("step for the slice operation, supports negative values."); } bool operator==(const SliceParam& other) const { From 0758d0c5c072c222c928af11378a910b4d41236d Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Tue, 16 Apr 2019 12:10:36 -0700 Subject: [PATCH 14/30] fix compiler warning --- src/operator/contrib/proposal-inl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/operator/contrib/proposal-inl.h b/src/operator/contrib/proposal-inl.h index 63bc4055b6b9..10f1f86806e4 100644 --- a/src/operator/contrib/proposal-inl.h +++ b/src/operator/contrib/proposal-inl.h @@ -215,8 +215,8 @@ inline void GenerateAnchors(const std::vector& base_anchor, const mxnet::Tuple& ratios, const mxnet::Tuple& scales, std::vector *out_anchors) { - for (size_t j = 0; j < ratios.ndim(); ++j) { - for (size_t k = 0; k < scales.ndim(); ++k) { + for (int j = 0; j < ratios.ndim(); ++j) { + for (int k = 0; k < scales.ndim(); ++k) { _Transform(scales[k], ratios[j], base_anchor, out_anchors); } } From a503ec5df210c2d906871985a321a15098d88280 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Tue, 16 Apr 2019 12:23:28 -0700 Subject: [PATCH 15/30] fix compiler warning --- src/operator/tensor/matrix_op-inl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/operator/tensor/matrix_op-inl.h b/src/operator/tensor/matrix_op-inl.h index e9161d1fa3d3..e99741b70bb6 100644 --- a/src/operator/tensor/matrix_op-inl.h +++ b/src/operator/tensor/matrix_op-inl.h @@ -736,7 +736,7 @@ inline bool SliceOpShape(const nnvm::NodeAttrs& attrs, MXNET_NDIM_SWITCH(dshape.ndim(), ndim, { common::StaticArray begin, end, step; GetIndexRange(dshape, param.begin, param.end, param.step, &begin, &end, &step); - for (uint32_t i = 0; i < param.begin.ndim(); ++i) { + for (int i = 0; i < param.begin.ndim(); ++i) { const int b = begin[i], e = end[i], s = step[i]; SetSliceOpOutputDimSize(i, b, e, s, &oshape); } From cd9aa536c07845f651b2c5a9b577fb1fc2606b4a Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Tue, 16 Apr 2019 12:43:17 -0700 Subject: [PATCH 16/30] fix compiler warning --- src/operator/contrib/multi_proposal-inl.h | 4 ++-- 1 file changed, 2 insertions(+), 2
deletions(-) diff --git a/src/operator/contrib/multi_proposal-inl.h b/src/operator/contrib/multi_proposal-inl.h index 76186d446593..7010dadfedbc 100644 --- a/src/operator/contrib/multi_proposal-inl.h +++ b/src/operator/contrib/multi_proposal-inl.h @@ -217,8 +217,8 @@ inline void GenerateAnchors(const std::vector& base_anchor, const mxnet::Tuple& ratios, const mxnet::Tuple& scales, std::vector *out_anchors) { - for (size_t j = 0; j < ratios.ndim(); ++j) { - for (size_t k = 0; k < scales.ndim(); ++k) { + for (int j = 0; j < ratios.ndim(); ++j) { + for (int k = 0; k < scales.ndim(); ++k) { _Transform(scales[k], ratios[j], base_anchor, out_anchors); } } From 12559b1dc1c0acf61d25e1bb8a40e03bd838088b Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Tue, 16 Apr 2019 20:06:43 +0000 Subject: [PATCH 17/30] fix compiler warning --- src/operator/image/image_random-inl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/operator/image/image_random-inl.h b/src/operator/image/image_random-inl.h index 22566062e131..aeb189f35b78 100644 --- a/src/operator/image/image_random-inl.h +++ b/src/operator/image/image_random-inl.h @@ -245,7 +245,7 @@ inline bool NormalizeOpShape(const nnvm::NodeAttrs& attrs, << "Input tensor must have shape (channels, height, width), or " << "(N, channels, height, width), but got " << dshape; - uint32_t nchannels; + int nchannels = 0; if (dshape.ndim() == 3) { nchannels = dshape[0]; CHECK(nchannels == 3 || nchannels == 1) From a4e4a0c795b9e3e1e9c9bb68202e3b8c00e2bec7 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Tue, 16 Apr 2019 20:59:18 +0000 Subject: [PATCH 18/30] fix compiler warning --- src/operator/tensor/square_sum-inl.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/operator/tensor/square_sum-inl.h b/src/operator/tensor/square_sum-inl.h index 016b383117bc..269ffd15f394 100644 --- a/src/operator/tensor/square_sum-inl.h +++ b/src/operator/tensor/square_sum-inl.h @@ -434,14 +434,14 @@ void SquareSumRspGradImpl(const nnvm::NodeAttrs& attrs, " when ograd_stype = kRowSparseStorage"; CHECK_EQ(ograd.shape().ndim(), 2U); const TBlob ograd_row_idx = ograd.aux_data(rowsparse::kIdx); - CHECK(ograd_row_idx.Size() == in_row_idx.Size() || in_row_idx.Size() == in_data.shape_[0]); + CHECK(ograd_row_idx.Size() == in_row_idx.Size() || in_row_idx.Size() == static_cast(in_data.shape_[0])); igrad->CheckAndAlloc({ograd.aux_shape(rowsparse::kIdx)}); const TBlob& igrad_data = igrad->data(); const TBlob igrad_row_idx = igrad->aux_data(rowsparse::kIdx); MSHADOW_IDX_TYPE_SWITCH(igrad_row_idx.type_flag_, IType, { // when ograd_row_idx and in_row_idx have the same size and input is not a full rsp // ograd_row_idx and in_row_idx are expected to have the same elements - if (in_row_idx.Size() != input.shape()[0]) { // if input data is not a full rsp + if (in_row_idx.Size() != static_cast(input.shape()[0])) { // if input data is not a full rsp CHECK_EQ(ograd_row_idx.Size(), in_row_idx.Size()) << "SquareSumRspGradImpl only supports" " equal ograd_row_idx and" " input_row_idx when ograd and" @@ -452,7 +452,7 @@ void SquareSumRspGradImpl(const nnvm::NodeAttrs& attrs, } MSHADOW_TYPE_SWITCH(igrad_data.type_flag_, DType, { MXNET_ASSIGN_REQ_SWITCH(req, req_type, { - if (in_row_idx.Size() != input.shape()[0]) { // input data is not a full rsp + if (in_row_idx.Size() != static_cast(input.shape()[0])) { // input data is not a full rsp Kernel, xpu>::Launch( s, igrad_data.Size(), igrad_row_idx.dptr(), igrad_data.dptr(), ograd_row_idx.dptr(), From 
2399864009c6e5aa222cae09b122f8735a6f108a Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Tue, 16 Apr 2019 17:39:59 -0700 Subject: [PATCH 19/30] fix lint --- src/operator/tensor/square_sum-inl.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/operator/tensor/square_sum-inl.h b/src/operator/tensor/square_sum-inl.h index 269ffd15f394..c2e3182c6a1e 100644 --- a/src/operator/tensor/square_sum-inl.h +++ b/src/operator/tensor/square_sum-inl.h @@ -434,14 +434,16 @@ void SquareSumRspGradImpl(const nnvm::NodeAttrs& attrs, " when ograd_stype = kRowSparseStorage"; CHECK_EQ(ograd.shape().ndim(), 2U); const TBlob ograd_row_idx = ograd.aux_data(rowsparse::kIdx); - CHECK(ograd_row_idx.Size() == in_row_idx.Size() || in_row_idx.Size() == static_cast(in_data.shape_[0])); + CHECK(ograd_row_idx.Size() == in_row_idx.Size() || + in_row_idx.Size() == static_cast(in_data.shape_[0])); igrad->CheckAndAlloc({ograd.aux_shape(rowsparse::kIdx)}); const TBlob& igrad_data = igrad->data(); const TBlob igrad_row_idx = igrad->aux_data(rowsparse::kIdx); MSHADOW_IDX_TYPE_SWITCH(igrad_row_idx.type_flag_, IType, { // when ograd_row_idx and in_row_idx have the same size and input is not a full rsp // ograd_row_idx and in_row_idx are expected to have the same elements - if (in_row_idx.Size() != static_cast(input.shape()[0])) { // if input data is not a full rsp + if (in_row_idx.Size() != static_cast(input.shape()[0])) { + // if input data is not a full rsp CHECK_EQ(ograd_row_idx.Size(), in_row_idx.Size()) << "SquareSumRspGradImpl only supports" " equal ograd_row_idx and" " input_row_idx when ograd and" @@ -452,7 +454,8 @@ void SquareSumRspGradImpl(const nnvm::NodeAttrs& attrs, } MSHADOW_TYPE_SWITCH(igrad_data.type_flag_, DType, { MXNET_ASSIGN_REQ_SWITCH(req, req_type, { - if (in_row_idx.Size() != static_cast(input.shape()[0])) { // input data is not a full rsp + if (in_row_idx.Size() != static_cast(input.shape()[0])) { + // input data is not a full rsp Kernel, xpu>::Launch( s, igrad_data.Size(), igrad_row_idx.dptr(), igrad_data.dptr(), ograd_row_idx.dptr(), From 334d77515b292dce6f8224d633e9de6208a31804 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Tue, 16 Apr 2019 22:50:50 -0700 Subject: [PATCH 20/30] update CI runtime_functons --- ci/docker/runtime_functions.sh | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index a89c51de0d8e..72107477cc51 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -755,6 +755,40 @@ build_ubuntu_gpu_cmake() { ninja -v } +build_ubuntu_cpu_large_tensor() { + set -ex + + build_ccache_wrappers + + make \ + DEV=1 \ + ENABLE_TESTCOVERAGE=1 \ + USE_CPP_PACKAGE=1 \ + USE_BLAS=openblas \ + USE_INT64_TENSOR_SIZE=1 \ + USE_SIGNAL_HANDLER=1 \ + -j$(nproc) +} + +build_ubuntu_gpu_large_tensor() { + set -ex + + build_ccache_wrappers + + make \ + DEV=1 \ + ENABLE_TESTCOVERAGE=1 \ + USE_CPP_PACKAGE=1 \ + USE_BLAS=openblas \ + USE_CUDA=1 \ + USE_CUDA_PATH=/usr/local/cuda \ + USE_CUDNN=1 \ + CUDA_ARCH="$CI_CUDA_COMPUTE_CAPABILITIES" \ + USE_INT64_TENSOR_SIZE=1 \ + USE_SIGNAL_HANDLER=1 \ + -j$(nproc) +} + build_ubuntu_blc() { echo "pass" } From 826613a521050f1b9b78ebc64bde7e07d065da86 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Wed, 17 Apr 2019 10:01:05 -0700 Subject: [PATCH 21/30] update runtime function --- ci/docker/runtime_functions.sh | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/ci/docker/runtime_functions.sh 
b/ci/docker/runtime_functions.sh index 72107477cc51..959554e2e772 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -757,14 +757,14 @@ build_ubuntu_gpu_cmake() { build_ubuntu_cpu_large_tensor() { set -ex - + cd /work/build build_ccache_wrappers - + export CC="ccache gcc" + export CXX="ccache g++" make \ DEV=1 \ ENABLE_TESTCOVERAGE=1 \ USE_CPP_PACKAGE=1 \ - USE_BLAS=openblas \ USE_INT64_TENSOR_SIZE=1 \ USE_SIGNAL_HANDLER=1 \ -j$(nproc) @@ -772,14 +772,12 @@ build_ubuntu_cpu_large_tensor() { build_ubuntu_gpu_large_tensor() { set -ex - + cd /work/build build_ccache_wrappers - make \ DEV=1 \ ENABLE_TESTCOVERAGE=1 \ USE_CPP_PACKAGE=1 \ - USE_BLAS=openblas \ USE_CUDA=1 \ USE_CUDA_PATH=/usr/local/cuda \ USE_CUDNN=1 \ From 4412b90047b808b66029ba22af914f08b2086d8e Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Wed, 17 Apr 2019 10:25:52 -0700 Subject: [PATCH 22/30] correct runtime_functions --- ci/docker/runtime_functions.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index 959554e2e772..bfe56a8083eb 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -757,7 +757,6 @@ build_ubuntu_gpu_cmake() { build_ubuntu_cpu_large_tensor() { set -ex - cd /work/build build_ccache_wrappers export CC="ccache gcc" export CXX="ccache g++" @@ -772,7 +771,6 @@ build_ubuntu_cpu_large_tensor() { build_ubuntu_gpu_large_tensor() { set -ex - cd /work/build build_ccache_wrappers make \ DEV=1 \ @@ -1215,6 +1213,8 @@ nightly_test_KVStore_singleNode() { python tests/nightly/test_kvstore.py } +#Test + #Tests Amalgamation Build with 5 different sets of flags nightly_test_amalgamation() { set -ex From 1047eb50e359b93eab8ef035e2f06001de47099f Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Wed, 17 Apr 2019 18:27:41 +0000 Subject: [PATCH 23/30] udpate runtime functions --- ci/docker/runtime_functions.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index bfe56a8083eb..0dec50641106 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -775,7 +775,6 @@ build_ubuntu_gpu_large_tensor() { make \ DEV=1 \ ENABLE_TESTCOVERAGE=1 \ - USE_CPP_PACKAGE=1 \ USE_CUDA=1 \ USE_CUDA_PATH=/usr/local/cuda \ USE_CUDNN=1 \ From 97a1c082476d448ef10ea809a783a2aa2bc7d521 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Wed, 17 Apr 2019 11:35:21 -0700 Subject: [PATCH 24/30] add nightly test for large tensor --- ci/docker/runtime_functions.sh | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index 0dec50641106..c61e49b4eb96 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -1212,7 +1212,12 @@ nightly_test_KVStore_singleNode() { python tests/nightly/test_kvstore.py } -#Test +#Test Large Tensor Size +nightly_test_large_tensor() { + set -ex + export PYTHONPATH=./python/ + python tests/nightly/test_large_array.py +} #Tests Amalgamation Build with 5 different sets of flags nightly_test_amalgamation() { From 861b95e25a09eeb90cb75eadd487c4dd70486722 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Wed, 17 Apr 2019 11:45:10 -0700 Subject: [PATCH 25/30] update Jenkins files to test new compiler flag --- ci/jenkins/Jenkins_steps.groovy | 28 ++++++++++++++++++++++++++++ ci/jenkins/Jenkinsfile_unix_cpu | 3 ++- ci/jenkins/Jenkinsfile_unix_gpu | 1 + tests/nightly/JenkinsfileForBinaries | 16 ++++++++++++++++ 
tests/nightly/test_large_array.py | 28 +++++++++++++++++++++++++--- 5 files changed, 72 insertions(+), 4 deletions(-) diff --git a/ci/jenkins/Jenkins_steps.groovy b/ci/jenkins/Jenkins_steps.groovy index 5b9ad47f6afb..0f648c694b59 100644 --- a/ci/jenkins/Jenkins_steps.groovy +++ b/ci/jenkins/Jenkins_steps.groovy @@ -119,6 +119,34 @@ def compile_unix_openblas_debug_cpu() { }] } +def compile_unix_int64_cpu() { + return ['CPU: USE_INT64_TENSOR_SIZE': { + node(NODE_LINUX_CPU) { + ws('workspace/build-cpu-int64') { + timeout(time: max_time, unit: 'MINUTES') { + utils.init_git() + utils.docker_run('ubuntu_cpu', 'build_ubuntu_cpu_large_tensor', false) + utils.pack_lib('cpu', mx_lib, true) + } + } + } + }] +} + +def compile_unix_int64_gpu() { + return ['GPU: USE_INT64_TENSOR_SIZE': { + node(NODE_LINUX_GPU) { + ws('workspace/build-gpu-int64') { + timeout(time: max_time, unit: 'MINUTES') { + utils.init_git() + utils.docker_run('ubuntu_gpu', 'build_ubuntu_gpu_large_tensor', false) + utils.pack_lib('gpu', mx_lib, true) + } + } + } + }] +} + def compile_unix_mkl_cpu() { return ['CPU: MKL': { node(NODE_LINUX_CPU) { diff --git a/ci/jenkins/Jenkinsfile_unix_cpu b/ci/jenkins/Jenkinsfile_unix_cpu index 919381ebccd4..fa0942988d9c 100644 --- a/ci/jenkins/Jenkinsfile_unix_cpu +++ b/ci/jenkins/Jenkinsfile_unix_cpu @@ -38,7 +38,8 @@ core_logic: { custom_steps.compile_unix_openblas_debug_cpu(), custom_steps.compile_unix_mkl_cpu(), custom_steps.compile_unix_mkldnn_cpu(), - custom_steps.compile_unix_mkldnn_mkl_cpu() + custom_steps.compile_unix_mkldnn_mkl_cpu(), + custom_steps.compile_unix_int64_cpu() ]) utils.parallel_stage('Tests', [ diff --git a/ci/jenkins/Jenkinsfile_unix_gpu b/ci/jenkins/Jenkinsfile_unix_gpu index f6191deb7a68..fd8b4e2000b9 100644 --- a/ci/jenkins/Jenkinsfile_unix_gpu +++ b/ci/jenkins/Jenkinsfile_unix_gpu @@ -40,6 +40,7 @@ core_logic: { custom_steps.compile_unix_cmake_mkldnn_gpu(), custom_steps.compile_unix_cmake_gpu(), custom_steps.compile_unix_tensorrt_gpu(), + custom_steps.compile_unix_int64_gpu() ]) utils.parallel_stage('Tests', [ diff --git a/tests/nightly/JenkinsfileForBinaries b/tests/nightly/JenkinsfileForBinaries index 53e1c30e188f..9625d707211b 100755 --- a/tests/nightly/JenkinsfileForBinaries +++ b/tests/nightly/JenkinsfileForBinaries @@ -59,6 +59,22 @@ core_logic: { } } }, + 'Test Large Tensor Size: CPU': { + node(NODE_LINUX_CPU) { + ws('workspace/large_tensor-cpu') { + utils.unpack_and_init('cpu', mx_lib) + utils.docker_run('ubuntu_nightly_cpu', 'nightly_test_large_tensor USE_INT64_TENSOR_SIZE=1', false) + } + } + } + 'Test Large Tensor Size: GPU': { + node(NODE_LINUX_GPU) { + ws('workspace/large_tensor-gpu') { + utils.unpack_and_init('gpu', mx_lib) + utils.docker_run('ubuntu_nightly_gpu', 'nightly_test_large_tensor USE_INT64_TENSOR_SIZE=1', false) + } + } + } 'StraightDope: Python2 Single-GPU': { node(NODE_LINUX_GPU_P3) { ws('workspace/straight_dope-single_gpu') { diff --git a/tests/nightly/test_large_array.py b/tests/nightly/test_large_array.py index a627467cb959..1b7dad487a68 100644 --- a/tests/nightly/test_large_array.py +++ b/tests/nightly/test_large_array.py @@ -27,6 +27,7 @@ SMALL_Y = 50 LARGE_SIZE = LARGE_X * SMALL_Y + def test_gluon_embedding(): m = gluon.nn.Embedding(SMALL_Y, MEDIUM_X) m.initialize() @@ -35,22 +36,26 @@ def test_gluon_embedding(): assert b.shape == (MEDIUM_X, SMALL_Y, MEDIUM_X) assert b.asnumpy().size == LARGE_SIZE + def test_ndarray_zeros(): a = nd.zeros(shape=(LARGE_X, SMALL_Y)) assert a[-1][0] == 0 assert a.shape == (LARGE_X, SMALL_Y) assert a.size == 
LARGE_SIZE + def test_ndarray_ones(): a = nd.ones(shape=(LARGE_X, SMALL_Y)) assert a[-1][0] == 1 assert nd.sum(a).asnumpy() == LARGE_SIZE + @with_seed() def test_ndarray_random_uniform(): a = nd.random.uniform(shape=(LARGE_X, SMALL_Y)) assert a[-1][0] != 0 + @with_seed() def test_ndarray_random_randint(): a = nd.random.randint(100, 10000, shape=(LARGE_X, SMALL_Y)) @@ -59,14 +64,16 @@ def test_ndarray_random_randint(): low_large_value = 2**32 high_large_value = 2**34 a = nd.random.randint(low_large_value,high_large_value) - low = mx.nd.array([low_large_value],dtype='int64') - high = mx.nd.array([high_large_value],dtype='int64') + low = mx.nd.array([low_large_value], dtype='int64') + high = mx.nd.array([high_large_value], dtype='int64') assert a.__gt__(low) & a.__lt__(high) + def test_ndarray_empty(): a = nd.empty((LARGE_X, SMALL_Y)) assert a.shape == (LARGE_X, SMALL_Y) + def test_elementwise(): a = nd.ones(shape=(LARGE_X, SMALL_Y)) b = nd.ones(shape=(LARGE_X, SMALL_Y)) @@ -77,22 +84,26 @@ def test_elementwise(): res = nd.sqrt(a + 3) assert np.sum(res[-1].asnumpy() == 2) == a.shape[1] + def test_reduce(): a = nd.ones(shape=(LARGE_X, SMALL_Y)) assert nd.sum(a).asnumpy() == a.shape[0] * a.shape[1] + def test_dot(): a = nd.ones(shape=(LARGE_X, SMALL_Y)) b = nd.ones(shape=(SMALL_Y, SMALL_Y)) res = nd.dot(a, b) assert np.sum(res[-1].asnumpy() == SMALL_Y) == b.shape[1] + def test_FullyConnected(): a = nd.ones(shape=(LARGE_X, SMALL_Y)) b = nd.ones(shape=(SMALL_Y, SMALL_Y)) res = nd.FullyConnected(a, b, num_hidden=b.shape[1], no_bias=True) assert np.sum(res[-1].asnumpy() == SMALL_Y) == b.shape[1] + def test_broadcast(): a = nd.ones(shape=(LARGE_X, SMALL_Y)) b = nd.arange(0, LARGE_X).reshape(LARGE_X, 1) @@ -101,45 +112,53 @@ def test_broadcast(): res = mx.nd.broadcast_like(b, a) assert np.sum(res[-1].asnumpy() == LARGE_X) == a.shape[1] + def test_clip(): a = nd.arange(0, LARGE_X).reshape(LARGE_X, 1) b = nd.broadcast_to(a, shape=(a.shape[0], SMALL_Y)) res = nd.clip(b, a_min=100, a_max=1000) assert np.sum(res[-1].asnumpy() == 1000) == b.shape[1] + def test_take(): a = nd.ones(shape=(LARGE_X, SMALL_Y)) idx = nd.arange(LARGE_X-1000, LARGE_X) res = nd.take(a, idx) assert np.sum(res[-1].asnumpy() == 1) == res.shape[1] + def test_slice(): a = nd.ones(shape=(LARGE_X, SMALL_Y)) res = nd.slice(a, begin=(LARGE_X-1000, 1), end=(LARGE_X, SMALL_Y)) assert np.sum(res[-1].asnumpy() == 1) == res.shape[1] + def test_slice_assign(): a = nd.ones(shape=(LARGE_X, SMALL_Y)) a[LARGE_X-1:LARGE_X] = 1000 assert np.sum(a[-1].asnumpy() == 1000) == a.shape[1] - + + def test_expand_dims(): a = nd.ones(shape=(LARGE_X, SMALL_Y)) res = nd.expand_dims(a, axis=1) assert res.shape == (a.shape[0], 1, a.shape[1]) + def test_squeeze(): a = nd.ones(shape=(LARGE_X, SMALL_Y)) data = nd.expand_dims(a, axis=1) res = nd.squeeze(data) assert res.shape == a.shape + def test_broadcast_div(): a = nd.ones(shape=(LARGE_X, SMALL_Y)) b = nd.ones(shape=(LARGE_X, 1)) * 2 res = a / b assert np.sum(res[-1].asnumpy() == 0.5) == a.shape[1] + def test_Dense(ctx=mx.cpu(0)): data = mx.nd.ones(shape=(50*1000*1000, 100)) linear = gluon.nn.Dense(100) @@ -148,6 +167,7 @@ def test_Dense(ctx=mx.cpu(0)): res.wait_to_read() assert res.shape == (50000000, 100) + def test_where(): a = nd.ones(shape=(LARGE_X, SMALL_Y)) b = nd.arange(0, LARGE_X).reshape(LARGE_X, 1) @@ -159,12 +179,14 @@ def test_where(): res = nd.sparse.where(csr_cond, a, b) assert np.sum(res[0].asnumpy() == 1) == b.shape[1] + def test_pick(): a = mx.nd.ones(shape=(256*35, 1024*1024)) b = 
mx.nd.ones(shape=(256*35,)) res = mx.nd.pick(a,b) assert res.shape == b.shape + if __name__ == '__main__': import nose nose.runmodule() From b86e630632b1cd8ae6e9621b1aa75f56b8e8a2ba Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Thu, 18 Apr 2019 11:59:19 -0700 Subject: [PATCH 26/30] fix CI --- ci/jenkins/Jenkins_steps.groovy | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/jenkins/Jenkins_steps.groovy b/ci/jenkins/Jenkins_steps.groovy index 0f648c694b59..18bb59cc9a37 100644 --- a/ci/jenkins/Jenkins_steps.groovy +++ b/ci/jenkins/Jenkins_steps.groovy @@ -126,7 +126,7 @@ def compile_unix_int64_cpu() { timeout(time: max_time, unit: 'MINUTES') { utils.init_git() utils.docker_run('ubuntu_cpu', 'build_ubuntu_cpu_large_tensor', false) - utils.pack_lib('cpu', mx_lib, true) + utils.pack_lib('ubuntu_cpu_int64', mx_lib, true) } } } @@ -140,7 +140,7 @@ def compile_unix_int64_gpu() { timeout(time: max_time, unit: 'MINUTES') { utils.init_git() utils.docker_run('ubuntu_gpu', 'build_ubuntu_gpu_large_tensor', false) - utils.pack_lib('gpu', mx_lib, true) + utils.pack_lib('ubuntu_gpu_int64', mx_lib, true) } } } From d8b04b37eacd9cd357bbf5a7436d8a0f604eb93e Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Thu, 18 Apr 2019 23:28:06 -0700 Subject: [PATCH 27/30] add runtime feature detect for the compiler flag --- include/mxnet/libinfo.h | 6 +++++- src/libinfo.cc | 3 +++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/include/mxnet/libinfo.h b/include/mxnet/libinfo.h index f35d41a9aa8a..8b58a398c673 100644 --- a/include/mxnet/libinfo.h +++ b/include/mxnet/libinfo.h @@ -123,7 +123,9 @@ #define MXNET_USE_SIGNAL_HANDLER 0 #endif - +#ifndef MXNET_USE_INT64_TENSOR_SIZE +#define MXNET_USE_INT64_TENSOR_SIZE MSHADOW_INT64_TENSOR_SIZE +#endif namespace mxnet { namespace features { @@ -177,6 +179,8 @@ enum : unsigned { PROFILER, DIST_KVSTORE, CXX14, + INT64_TENSOR_SIZE, + // Signal handler to print stack traces on exceptions SIGNAL_HANDLER, DEBUG, diff --git a/src/libinfo.cc b/src/libinfo.cc index 2af61eac9eca..f67b45ed1c14 100644 --- a/src/libinfo.cc +++ b/src/libinfo.cc @@ -86,7 +86,9 @@ class FeatureSet { // Misc feature_bits.set(CAFFE, MXNET_USE_CAFFE); feature_bits.set(DIST_KVSTORE, MXNET_USE_DIST_KVSTORE); + feature_bits.set(INT64_TENSOR_SIZE, MXNET_USE_INT64_TENSOR_SIZE); feature_bits.set(SIGNAL_HANDLER, MXNET_USE_SIGNAL_HANDLER); + #ifndef NDEBUG feature_bits.set(DEBUG); #endif @@ -154,6 +156,7 @@ const std::vector EnumNames::names = { "PROFILER", "DIST_KVSTORE", "CXX14", + "INT64_TENSOR_SIZE", "SIGNAL_HANDLER", "DEBUG", }; From 20221d63e710ad39ad63372ca333091b88162763 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Fri, 19 Apr 2019 11:40:38 -0700 Subject: [PATCH 28/30] change build from make to cmake --- ci/docker/runtime_functions.sh | 56 ++++++++++++++++++++++------------ 1 file changed, 37 insertions(+), 19 deletions(-) diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index d0fcc142d422..8a0e31672136 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -757,31 +757,49 @@ build_ubuntu_gpu_cmake() { build_ubuntu_cpu_large_tensor() { set -ex + cd /work/build build_ccache_wrappers - export CC="ccache gcc" - export CXX="ccache g++" - make \ - DEV=1 \ - ENABLE_TESTCOVERAGE=1 \ - USE_CPP_PACKAGE=1 \ - USE_INT64_TENSOR_SIZE=1 \ - USE_SIGNAL_HANDLER=1 \ - -j$(nproc) + cmake \ + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ + -DCMAKE_C_COMPILER_LAUNCHER=ccache \ + -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache \ + -DUSE_SIGNAL_HANDLER=ON \ + 
-DENABLE_TESTCOVERAGE=ON \ + -DUSE_CUDA=OFF \ + -DUSE_CUDNN=OFF \ + -DUSE_MKLDNN=OFF \ + -DCMAKE_BUILD_TYPE=Release \ + -DUSE_INT64_TENSOR_SIZE=ON \ + -G Ninja \ + /work/mxnet + + ninja -v } build_ubuntu_gpu_large_tensor() { set -ex + cd /work/build build_ccache_wrappers - make \ - DEV=1 \ - ENABLE_TESTCOVERAGE=1 \ - USE_CUDA=1 \ - USE_CUDA_PATH=/usr/local/cuda \ - USE_CUDNN=1 \ - CUDA_ARCH="$CI_CUDA_COMPUTE_CAPABILITIES" \ - USE_INT64_TENSOR_SIZE=1 \ - USE_SIGNAL_HANDLER=1 \ - -j$(nproc) + cmake \ + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ + -DCMAKE_C_COMPILER_LAUNCHER=ccache \ + -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache \ + -DUSE_SIGNAL_HANDLER=ON \ + -DENABLE_TESTCOVERAGE=ON \ + -DUSE_CUDA=ON \ + -DUSE_CUDNN=ON \ + -DUSE_MKL_IF_AVAILABLE=OFF \ + -DUSE_MKLML_MKL=OFF \ + -DUSE_MKLDNN=OFF \ + -DUSE_DIST_KVSTORE=ON \ + -DCMAKE_BUILD_TYPE=Release \ + -DCUDA_ARCH_NAME=Manual \ + -DCUDA_ARCH_BIN=$CI_CMAKE_CUDA_ARCH_BIN \ + -DUSE_INT64_TENSOR_SIZE=ON \ + -G Ninja \ + /work/mxnet + + ninja -v } build_ubuntu_blc() { From bc95113af630e10e05a2d3c769093501dbc64958 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Fri, 19 Apr 2019 13:17:29 -0700 Subject: [PATCH 29/30] fix CI --- ci/jenkins/Jenkins_steps.groovy | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/jenkins/Jenkins_steps.groovy b/ci/jenkins/Jenkins_steps.groovy index fa87aaa59e27..23230ac0442f 100644 --- a/ci/jenkins/Jenkins_steps.groovy +++ b/ci/jenkins/Jenkins_steps.groovy @@ -126,7 +126,7 @@ def compile_unix_int64_cpu() { timeout(time: max_time, unit: 'MINUTES') { utils.init_git() utils.docker_run('ubuntu_cpu', 'build_ubuntu_cpu_large_tensor', false) - utils.pack_lib('ubuntu_cpu_int64', mx_lib, true) + utils.pack_lib('ubuntu_cpu_int64', mx_cmake_lib, true) } } } @@ -140,7 +140,7 @@ def compile_unix_int64_gpu() { timeout(time: max_time, unit: 'MINUTES') { utils.init_git() utils.docker_run('ubuntu_gpu', 'build_ubuntu_gpu_large_tensor', false) - utils.pack_lib('ubuntu_gpu_int64', mx_lib, true) + utils.pack_lib('ubuntu_gpu_int64', mx_cmake_lib, true) } } } From 9c672b7852321308f945b141e89a66e6a523f781 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Fri, 19 Apr 2019 17:12:45 -0700 Subject: [PATCH 30/30] move tests to nightly --- ci/docker/runtime_functions.sh | 2 +- tests/nightly/JenkinsfileForBinaries | 31 ++++++++++++++++++++++------ 2 files changed, 26 insertions(+), 7 deletions(-) diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index 8a0e31672136..c3610d2452e0 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -1234,7 +1234,7 @@ nightly_test_KVStore_singleNode() { nightly_test_large_tensor() { set -ex export PYTHONPATH=./python/ - python tests/nightly/test_large_array.py + nosetests-3.4 tests/nightly/test_large_array.py } #Tests Amalgamation Build with 5 different sets of flags diff --git a/tests/nightly/JenkinsfileForBinaries b/tests/nightly/JenkinsfileForBinaries index 9625d707211b..13bb50e0e484 100755 --- a/tests/nightly/JenkinsfileForBinaries +++ b/tests/nightly/JenkinsfileForBinaries @@ -19,6 +19,7 @@ //This is a Jenkinsfile for nightly tests. 
The format and some functions have been picked up from the top-level Jenkinsfile mx_lib = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a' +mx_cmake_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so' node('utility') { // Loading the utilities requires a node context unfortunately @@ -39,6 +40,24 @@ core_logic: { utils.pack_lib('gpu', mx_lib) } } + }, + 'CPU: USE_INT64_TENSOR_SIZE': { + node(NODE_LINUX_CPU) { + ws('workspace/build-cpu-int64') { + utils.init_git() + utils.docker_run('ubuntu_nightly_cpu', 'build_ubuntu_cpu_large_tensor', false) + utils.pack_lib('ubuntu_cpu_int64', mx_cmake_lib, true) + } + } + }, + 'GPU: USE_INT64_TENSOR_SIZE': { + node(NODE_LINUX_GPU) { + ws('workspace/build-gpu-int64') { + utils.init_git() + utils.docker_run('ubuntu_nightly_gpu', 'build_ubuntu_gpu_large_tensor', true) + utils.pack_lib('ubuntu_gpu_int64', mx_cmake_lib, true) + } + } } } @@ -62,19 +81,19 @@ core_logic: { 'Test Large Tensor Size: CPU': { node(NODE_LINUX_CPU) { ws('workspace/large_tensor-cpu') { - utils.unpack_and_init('cpu', mx_lib) - utils.docker_run('ubuntu_nightly_cpu', 'nightly_test_large_tensor USE_INT64_TENSOR_SIZE=1', false) + utils.unpack_and_init('cpu_int64', mx_cmake_lib) + utils.docker_run('ubuntu_nightly_cpu', 'nightly_test_large_tensor', false) } } - } + }, 'Test Large Tensor Size: GPU': { node(NODE_LINUX_GPU) { ws('workspace/large_tensor-gpu') { - utils.unpack_and_init('gpu', mx_lib) - utils.docker_run('ubuntu_nightly_gpu', 'nightly_test_large_tensor USE_INT64_TENSOR_SIZE=1', false) + utils.unpack_and_init('gpu_int64', mx_cmake_lib) + utils.docker_run('ubuntu_nightly_gpu', 'nightly_test_large_tensor', true) } } - } + }, 'StraightDope: Python2 Single-GPU': { node(NODE_LINUX_GPU_P3) { ws('workspace/straight_dope-single_gpu') {