diff --git a/src/operator/tensor/cast_storage-inl.cuh b/src/operator/tensor/cast_storage-inl.cuh index 1a47b68f7aac..afef53e979ea 100644 --- a/src/operator/tensor/cast_storage-inl.cuh +++ b/src/operator/tensor/cast_storage-inl.cuh @@ -25,162 +25,20 @@ #ifndef MXNET_OPERATOR_TENSOR_CAST_STORAGE_INL_CUH_ #define MXNET_OPERATOR_TENSOR_CAST_STORAGE_INL_CUH_ +#include #include #include #include - -#include +#include "./util/tensor_util-inl.cuh" namespace mxnet { namespace op { /*! - * \brief Thread kernel for marking non-zero rows of a tensor. - * Parallelized by tensor rows: 1 thread/row - */ -struct MarkRspRowIdxThreadKernel { - /*! - * \brief - * \param tid global thread id - * \param row_flg row flag array to mark non-zero rows - * \param dns dense matrix data - * \param num_rows number of rows (size of first dimension of tensor) - * \param row_length number of elements per row - */ - template - __device__ __forceinline__ static void Map(int tid, - RType* row_flg, - const DType* dns, - const nnvm::dim_t num_rows, - const nnvm::dim_t row_length) { - using nnvm::dim_t; - if (tid < num_rows) { - dim_t j = 0; - dim_t offset = tid * row_length; - for (; j < row_length; ++j) { - if (dns[offset+j] != 0) { - break; - } - } - if (j < row_length) { - row_flg[tid] = 1; // mark as one for non-zero row - } else { - row_flg[tid] = 0; // mark as zero for zero row - } - } - } -}; - -/*! - * \brief Warp kernel for marking non-zero rows of a tensor. - * Parallelized by tensor rows: 1 warp/row - */ -struct MarkRspRowIdxWarpKernel { - template - __device__ __forceinline__ static void Map(int tid, - RType* row_flg, - const DType* dns, - const nnvm::dim_t num_rows, - const nnvm::dim_t row_length) { - using nnvm::dim_t; - typedef cub::WarpReduce WarpReduce; - const dim_t warps_per_block = mshadow::cuda::kBaseThreadNum / 32; - __shared__ typename WarpReduce::TempStorage temp_storage[warps_per_block]; - - const dim_t warp_id = tid / 32; // global warp id - const dim_t warp_lane = threadIdx.x / 32; // local warp id within thread block - const dim_t lane = tid & (32-1); // local thread id within warp - - if (warp_id < num_rows) { - dim_t flg = 0; - dim_t offset = warp_id * row_length; - for (dim_t j = lane; j < row_length; j+=32) { - if (dns[offset+j] != 0) { - // avoid break: causes slower performance on sparse tensors (<20% density), - // due to thread divergence - flg++; - } - } - dim_t aggr = WarpReduce(temp_storage[warp_lane]).Sum(flg); - if (lane == 0) { - if (aggr > 0) { - row_flg[warp_id] = 1; // mark as one for non-zero row - } else { - row_flg[warp_id] = 0; // mark as zero for zero row - } - } - } - } -}; - -/*! - * \brief Block kernel for marking non-zero rows of a tensor. 
- * Parallelized by tensor rows: 1 threadBlock/row - */ -struct MarkRspRowIdxBlockKernel { - template - __device__ __forceinline__ static void Map(int tid, - RType* row_flg, - const DType* dns, - const nnvm::dim_t num_rows, - const nnvm::dim_t row_length) { - using nnvm::dim_t; - using mshadow::cuda::kBaseThreadNum; - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - if (blockIdx.x < num_rows) { - dim_t flg = 0; - dim_t offset = blockIdx.x * row_length; - for (dim_t j = threadIdx.x; j < row_length; j+=kBaseThreadNum) { - if (dns[offset+j] != 0) { - // avoid break: causes slower performance on sparse tensors (<20% density), - // due to thread divergence - flg++; - } - } - dim_t aggr = BlockReduce(temp_storage).Sum(flg); - if (threadIdx.x == 0) { - if (aggr > 0) { - row_flg[blockIdx.x] = 1; // mark as one for non-zero row - } else { - row_flg[blockIdx.x] = 0; // mark as zero for zero row - } - } - } - } -}; - -/*! - * \brief Kernel for filling the row index array of the rsp tensor. - * Parallelized by tensor rows: 1 thread/row - */ -struct FillRspRowIdxKernel { - /*! - * \brief - * \param tid global thread id - * \param row_idx row index array to store indices of non-zero rows - * \param row_flg_sum inclusive prefix sum array over marked row flag array - * \param num_rows number of rows (size of first dimension of tensor) - */ - template - __device__ __forceinline__ static void Map(int tid, - RType* row_idx, - const RType* row_flg_sum, - const nnvm::dim_t num_rows) { - if (tid < num_rows) { - nnvm::dim_t prev = (tid == 0)? 0 : row_flg_sum[tid-1]; - if (row_flg_sum[tid] > prev) { - row_idx[prev] = tid; - } - } - } -}; - -/*! - * \brief Kernel for filling the value array of the rsp tensor. + * \brief GPU Kernel for filling the value array of the rsp tensor. * Parallelized by rsp tensor elements: 1 thread/element */ -struct FillRspValsKernel { +struct CastDnsRspValsKernel { /*! 
* \brief * \param tid global thread id @@ -243,7 +101,7 @@ inline void CastStorageDnsRspImpl(const OpContext& ctx, LOG(FATAL) << "CastStorageDnsRspImpl GPU kernels expect warpSize=32"; } // Determine temporary device storage requirements - RType* row_flg = NULL; + dim_t* row_flg = NULL; void* d_temp_storage = NULL; size_t temp_storage_bytes = 0; cub::DeviceScan::InclusiveSum(d_temp_storage, @@ -254,10 +112,10 @@ inline void CastStorageDnsRspImpl(const OpContext& ctx, mshadow::Stream::GetStream(s)); // Allocate temp storage for marking non-zero rows and for cub's prefix sum - auto workspace = AllocateTempDataForCast(ctx, Shape1(num_rows*sizeof(RType) + auto workspace = AllocateTempDataForCast(ctx, Shape1(num_rows*sizeof(dim_t) + temp_storage_bytes)); - row_flg = reinterpret_cast(workspace.dptr_); - d_temp_storage = workspace.dptr_ + num_rows*sizeof(RType); + row_flg = reinterpret_cast(workspace.dptr_); + d_temp_storage = workspace.dptr_ + num_rows*sizeof(dim_t); // Mark non-zero rows as 'one' in row_flg // Different kernel versions are optimized for different matrix instances @@ -268,31 +126,31 @@ inline void CastStorageDnsRspImpl(const OpContext& ctx, switch (kernel_version) { case 1: num_threads = num_rows; - Kernel::Launch(s, num_threads, + Kernel::Launch(s, num_threads, row_flg, dns.dptr(), num_rows, row_length); break; case 2: num_threads = num_rows * threads_per_warp; - Kernel::Launch(s, num_threads, + Kernel::Launch(s, num_threads, row_flg, dns.dptr(), num_rows, row_length); break; case 3: num_threads = num_rows * threads_per_block; - Kernel::Launch(s, num_threads, + Kernel::Launch(s, num_threads, row_flg, dns.dptr(), num_rows, row_length); break; default: if (row_length < threads_per_warp) { num_threads = num_rows; - Kernel::Launch(s, num_threads, + Kernel::Launch(s, num_threads, row_flg, dns.dptr(), num_rows, row_length); } else if (row_length < threads_per_block || num_rows > min_num_warps) { num_threads = num_rows * threads_per_warp; - Kernel::Launch(s, num_threads, + Kernel::Launch(s, num_threads, row_flg, dns.dptr(), num_rows, row_length); } else { num_threads = num_rows * threads_per_block; - Kernel::Launch(s, num_threads, + Kernel::Launch(s, num_threads, row_flg, dns.dptr(), num_rows, row_length); } break; @@ -306,11 +164,11 @@ inline void CastStorageDnsRspImpl(const OpContext& ctx, mshadow::Stream::GetStream(s)); // Get total number of non-zero rows from device - RType nnr = 0; - CUDA_CALL(cudaMemcpy(&nnr, &row_flg[num_rows-1], sizeof(RType), cudaMemcpyDeviceToHost)); + dim_t nnr = 0; + CUDA_CALL(cudaMemcpy(&nnr, &row_flg[num_rows-1], sizeof(dim_t), cudaMemcpyDeviceToHost)); // Allocate rsp tensor row index array and fill - rsp->CheckAndAllocAuxData(rowsparse::kIdx, Shape1(static_cast(nnr))); + rsp->CheckAndAllocAuxData(rowsparse::kIdx, Shape1(nnr)); if (0 == nnr) return; RType* row_idx = rsp->aux_data(rowsparse::kIdx).dptr(); num_threads = num_rows; @@ -322,7 +180,7 @@ inline void CastStorageDnsRspImpl(const OpContext& ctx, storage_shape[0] = nnr; rsp->CheckAndAllocData(storage_shape); num_threads = nnr * row_length; - Kernel::Launch(s, num_threads, + Kernel::Launch(s, num_threads, rsp->data().dptr(), row_idx, dns.dptr(), nnr, row_length); }); }); @@ -332,7 +190,7 @@ inline void CastStorageDnsRspImpl(const OpContext& ctx, * \brief Thread kernel for initializing the indptr in a csr matrix. * Parallelized by matrix rows: 1 thread/row */ -struct FillCsrIndPtrThreadKernel { +struct CastDnsCsrIndPtrThreadKernel { /*! 
* \brief * \param tid global thread id @@ -368,7 +226,7 @@ struct FillCsrIndPtrThreadKernel { * \brief Thread kernel for initializing the col_idx and value array of the csr matrix. * Parallelized by matrix rows: 1 thread/row */ -struct FillCsrColIdxAndValsThreadKernel { +struct CastDnsCsrColIdxAndValsThreadKernel { /*! * \brief * \param tid global thread id @@ -406,7 +264,7 @@ struct FillCsrColIdxAndValsThreadKernel { * \brief Warp kernel for initializing the indptr in a csr matrix. * Parallelized by matrix rows: 1 warp/row */ -struct FillCsrIndPtrWarpKernel { +struct CastDnsCsrIndPtrWarpKernel { template __device__ __forceinline__ static void Map(int tid, IType* indptr, @@ -444,7 +302,7 @@ struct FillCsrIndPtrWarpKernel { * \brief Warp kernel for initializing the col_idx and value array of the csr matrix. * Parallelized by matrix rows: 1 warp/row */ -struct FillCsrColIdxAndValsWarpKernel { +struct CastDnsCsrColIdxAndValsWarpKernel { template __device__ __forceinline__ static void Map(int tid, DType* val, @@ -498,7 +356,7 @@ struct FillCsrColIdxAndValsWarpKernel { * \brief Block kernel for initializing the indptr in a csr matrix. * Parallelized by matrix rows: 1 threadBlock/row */ -struct FillCsrIndPtrBlockKernel { +struct CastDnsCsrIndPtrBlockKernel { template __device__ __forceinline__ static void Map(int tid, IType* indptr, @@ -533,7 +391,7 @@ struct FillCsrIndPtrBlockKernel { * \brief Block kernel for initializing the col_idx and value array of the csr matrix. * Parallelized by matrix rows: 1 threadBlock/row */ -struct FillCsrColIdxAndValsBlockKernel { +struct CastDnsCsrColIdxAndValsBlockKernel { template __device__ __forceinline__ static void Map(int tid, DType* val, @@ -620,31 +478,31 @@ inline void CastStorageDnsCsrImpl(const OpContext& ctx, switch (kernel_version) { case 1: num_threads = num_rows; - Kernel::Launch(s, num_threads, + Kernel::Launch(s, num_threads, indptr, dns_data, num_rows, num_cols); break; case 2: num_threads = num_rows * threads_per_warp; - Kernel::Launch(s, num_threads, + Kernel::Launch(s, num_threads, indptr, dns_data, num_rows, num_cols); break; case 3: num_threads = num_rows * threads_per_block; - Kernel::Launch(s, num_threads, + Kernel::Launch(s, num_threads, indptr, dns_data, num_rows, num_cols); break; default: if (num_cols < threads_per_warp) { num_threads = num_rows; - Kernel::Launch(s, num_threads, + Kernel::Launch(s, num_threads, indptr, dns_data, num_rows, num_cols); } else if (num_cols < threads_per_block || num_rows > min_num_warps) { num_threads = num_rows * threads_per_warp; - Kernel::Launch(s, num_threads, + Kernel::Launch(s, num_threads, indptr, dns_data, num_rows, num_cols); } else { num_threads = num_rows * threads_per_block; - Kernel::Launch(s, num_threads, + Kernel::Launch(s, num_threads, indptr, dns_data, num_rows, num_cols); } break; @@ -685,36 +543,36 @@ inline void CastStorageDnsCsrImpl(const OpContext& ctx, switch (kernel_version) { case 1: num_threads = num_rows; - Kernel::Launch(s, num_threads, + Kernel::Launch(s, num_threads, csr->data().dptr(), csr->aux_data(csr::kIdx).dptr(), indptr, dns_data, num_rows, num_cols); break; case 2: num_threads = num_rows * threads_per_warp; - Kernel::Launch(s, num_threads, + Kernel::Launch(s, num_threads, csr->data().dptr(), csr->aux_data(csr::kIdx).dptr(), indptr, dns_data, num_rows, num_cols); break; case 3: num_threads = num_rows * threads_per_block; - Kernel::Launch(s, num_threads, + Kernel::Launch(s, num_threads, csr->data().dptr(), csr->aux_data(csr::kIdx).dptr(), indptr, dns_data, num_rows, 
num_cols); break; default: if (num_cols < threads_per_warp) { num_threads = num_rows; - Kernel::Launch(s, num_threads, + Kernel::Launch(s, num_threads, csr->data().dptr(), csr->aux_data(csr::kIdx).dptr(), indptr, dns_data, num_rows, num_cols); } else if (num_cols < threads_per_block || num_rows > min_num_warps) { num_threads = num_rows * threads_per_warp; - Kernel::Launch(s, num_threads, + Kernel::Launch(s, num_threads, csr->data().dptr(), csr->aux_data(csr::kIdx).dptr(), indptr, dns_data, num_rows, num_cols); } else { num_threads = num_rows * threads_per_block; - Kernel::Launch(s, num_threads, + Kernel::Launch(s, num_threads, csr->data().dptr(), csr->aux_data(csr::kIdx).dptr(), indptr, dns_data, num_rows, num_cols); } diff --git a/src/operator/tensor/cast_storage-inl.h b/src/operator/tensor/cast_storage-inl.h index 7d4f39ff9c58..acb30a9eff2b 100644 --- a/src/operator/tensor/cast_storage-inl.h +++ b/src/operator/tensor/cast_storage-inl.h @@ -43,10 +43,13 @@ namespace op { struct MarkRspRowIdx { // i represents the row index of the tensor data template - MSHADOW_XINLINE static void Map(int i, RType* row_idx, const DType* data, - const index_t row_length) { - index_t j = 0; - index_t offset = i * row_length; + MSHADOW_CINLINE static void Map(int i, + RType* row_idx, + const DType* data, + const nnvm::dim_t row_length) { + using nnvm::dim_t; + dim_t j = 0; + dim_t offset = i * row_length; for (; j < row_length; ++j) { if (data[offset+j] != 0) { break; @@ -69,20 +72,22 @@ inline void CastStorageDnsRspImpl(const OpContext& ctx, NDArray* rsp) { using namespace rowsparse; using namespace mshadow; + using nnvm::dim_t; CHECK(rsp != nullptr); CHECK_EQ(rsp->storage_type(), kRowSparseStorage); CHECK_EQ(dns.shape_, rsp->shape()); mshadow::Stream* s = ctx.get_stream(); MSHADOW_TYPE_SWITCH(dns.type_flag_, DType, { // data type MSHADOW_IDX_TYPE_SWITCH(rsp->aux_type(kIdx), RType, { // row idx type - const index_t num_rows = dns.shape_[0]; - const index_t row_length = dns.shape_.ProdShape(1, dns.shape_.ndim()); + const dim_t num_rows = dns.shape_[0]; + const dim_t row_length = dns.shape_.ProdShape(1, dns.shape_.ndim()); rsp->CheckAndAllocAuxData(kIdx, Shape1(num_rows)); TBlob row_idx_blob = rsp->aux_data(kIdx); RType* row_idx = row_idx_blob.dptr(); - mxnet_op::Kernel::Launch(s, num_rows, row_idx, - dns.dptr(), row_length); - index_t nnr = 0; + dim_t num_threads = num_rows; + mxnet_op::Kernel::Launch(s, num_threads, + row_idx, dns.dptr(), row_length); + dim_t nnr = 0; nnr = common::ParallelAccumulate(row_idx, num_rows, nnr); rsp->set_aux_shape(kIdx, Shape1(nnr)); if (0 == nnr) return; @@ -91,8 +96,8 @@ inline void CastStorageDnsRspImpl(const OpContext& ctx, rsp->CheckAndAllocData(storage_shape); auto dns_data = dns.get_with_shape(Shape2(num_rows, row_length), s); auto rsp_data = rsp->data().get_with_shape(Shape2(nnr, row_length), s); - size_t idx = 0; - for (index_t i = 0; i < num_rows; ++i) { + dim_t idx = 0; + for (dim_t i = 0; i < num_rows; ++i) { if (row_idx[i] > 0) { row_idx[idx] = i; Copy(rsp_data[idx], dns_data[i], s); @@ -106,12 +111,16 @@ inline void CastStorageDnsRspImpl(const OpContext& ctx, // TODO(haibin) Use memcopy instead will be much faster than assigning each individual element struct CastStorageRspDnsKernel { template - MSHADOW_XINLINE static void Map(int i, const index_t row_length, const IType* idx, - const DType *data, DType* dns) { - auto rid = idx[i]; - auto dns_offset = rid * row_length; - auto rsp_offset = i * row_length; - for (size_t col = 0; col < row_length; col++) { + 
MSHADOW_XINLINE static void Map(int i, + const nnvm::dim_t row_length, + const IType* idx, + const DType *data, + DType* dns) { + using nnvm::dim_t; + IType rid = idx[i]; + dim_t dns_offset = rid * row_length; + dim_t rsp_offset = i * row_length; + for (dim_t col = 0; col < row_length; col++) { dns[dns_offset + col] = data[rsp_offset + col]; } } @@ -122,9 +131,12 @@ struct CastStorageRspDnsKernel { * since the shape is known at binding stage. */ template -void CastStorageRspDnsImpl(const OpContext& ctx, const NDArray& rsp, TBlob* dns) { +void CastStorageRspDnsImpl(const OpContext& ctx, + const NDArray& rsp, + TBlob* dns) { mshadow::Stream* s = ctx.get_stream(); CHECK_EQ(rsp.storage_type(), kRowSparseStorage); + using nnvm::dim_t; MSHADOW_TYPE_SWITCH(dns->type_flag_, DType, { MSHADOW_IDX_TYPE_SWITCH(rsp.aux_type(rowsparse::kIdx), IType, { // assign zeros @@ -134,11 +146,12 @@ void CastStorageRspDnsImpl(const OpContext& ctx, const NDArray& rsp, TBlob* dns) auto in_idx = rsp.aux_data(rowsparse::kIdx).FlatTo1D(s).dptr_; auto in_data = rsp.data().dptr(); auto out_data = dns->dptr(); - auto num_rows = rsp.aux_shape(rowsparse::kIdx).Size(); - const auto shape = rsp.shape(); - auto row_length = shape.ProdShape(1, shape.ndim()); - mxnet_op::Kernel::Launch(s, num_rows, row_length, in_idx, - in_data, out_data); + auto shape = rsp.shape(); + const dim_t num_rows = rsp.aux_shape(rowsparse::kIdx).Size(); + const dim_t row_length = shape.ProdShape(1, shape.ndim()); + const dim_t num_threads = num_rows; + mxnet_op::Kernel::Launch(s, num_threads, + row_length, in_idx, in_data, out_data); } }); }); @@ -150,18 +163,22 @@ void CastStorageRspDnsImpl(const OpContext& ctx, const NDArray& rsp, TBlob* dns) struct FillCsrIndPtr { /*! * \brief - * \param i the i-th row of the dns tensor - * \param indptr indptr of the csr tensor - * \param dns the dns tensor - * \param num_rows - * \param num_cols + * \param i the i-th row of the dns tensor + * \param indptr the indptr of the csr tensor + * \param dns the dns tensor + * \param num_rows number of rows of the dns tensor + * \param num_cols number of columns of the dns tensor */ template - MSHADOW_XINLINE static void Map(int i, IType* indptr, const DType* dns, - const int num_rows, const int num_cols) { + MSHADOW_CINLINE static void Map(int i, + IType* indptr, + const DType* dns, + const nnvm::dim_t num_rows, + const nnvm::dim_t num_cols) { + using nnvm::dim_t; indptr[i+1] = 0; - const int offset = i * num_cols; - for (int j = 0; j < num_cols; ++j) { + const dim_t offset = i * num_cols; + for (dim_t j = 0; j < num_cols; ++j) { if (dns[offset+j] != 0) { ++indptr[i+1]; } @@ -175,21 +192,26 @@ struct FillCsrIndPtr { struct FillCsrColIdxAndVals { /*! 
* \brief - * \param i the i-th row of the dns tensor - * \param val value array of the csr - * \param col_idx column idx array of the csr - * \param indptr indptr array of the csr - * \param dns the dns tensor - * \param num_rows number of rows of the dns - * \param num_cols number of columns of the dns + * \param i the i-th row of the dns tensor + * \param val value array of the csr tensor + * \param col_idx column idx array of the csr tensor + * \param indptr indptr array of the csr tensor + * \param dns dns tensor + * \param num_rows number of rows of the dns tensor + * \param num_cols number of columns of the dns tensor */ template - MSHADOW_XINLINE static void Map(int i, DType* val, CType* col_idx, - const IType* indptr, const DType* dns, - const index_t num_rows, const index_t num_cols) { - const index_t offset = i * num_cols; + MSHADOW_CINLINE static void Map(int i, + DType* val, + CType* col_idx, + const IType* indptr, + const DType* dns, + const nnvm::dim_t num_rows, + const nnvm::dim_t num_cols) { + using nnvm::dim_t; + const dim_t offset = i * num_cols; IType k = indptr[i]; - for (index_t j = 0; j < num_cols; ++j) { + for (dim_t j = 0; j < num_cols; ++j) { if (dns[offset+j] != 0) { val[k] = dns[offset+j]; col_idx[k] = j; @@ -210,29 +232,31 @@ inline void CastStorageDnsCsrImpl(const OpContext& ctx, CHECK_EQ(csr->storage_type(), kCSRStorage); CHECK_EQ(dns.shape_.ndim(), 2); CHECK_EQ(dns.shape_, csr->shape()); + using mshadow::Shape1; + using nnvm::dim_t; mshadow::Stream* s = ctx.get_stream(); MSHADOW_TYPE_SWITCH(dns.type_flag_, DType, { // data type MSHADOW_IDX_TYPE_SWITCH(csr->aux_type(csr::kIndPtr), IType, { // indptr type MSHADOW_IDX_TYPE_SWITCH(csr->aux_type(csr::kIdx), CType, { // col idx type - const index_t num_rows = dns.shape_[0]; - const index_t num_cols = dns.shape_[1]; + const dim_t num_rows = dns.shape_[0]; + const dim_t num_cols = dns.shape_[1]; csr->CheckAndAllocAuxData(csr::kIndPtr, mshadow::Shape1(num_rows+1)); IType* indptr = csr->aux_data(csr::kIndPtr).dptr(); DType* dns_data = dns.dptr(); - mxnet_op::Kernel::Launch(s, num_rows, indptr, - dns_data, num_rows, num_cols); + dim_t num_threads = num_rows; + mxnet_op::Kernel::Launch(s, num_threads, + indptr, dns_data, num_rows, num_cols); // single thread to accumulate indptr // indptr[num_rows] indicates the number of non-zero elements indptr[0] = 0; - for (index_t i = 0; i < num_rows; ++i) { + for (dim_t i = 0; i < num_rows; ++i) { indptr[i+1] += indptr[i]; } // allocate column idx array and value array - csr->CheckAndAllocAuxData(csr::kIdx, - mshadow::Shape1(static_cast(indptr[num_rows]))); - csr->CheckAndAllocData(mshadow::Shape1(static_cast(indptr[num_rows]))); + csr->CheckAndAllocAuxData(csr::kIdx, Shape1(static_cast(indptr[num_rows]))); + csr->CheckAndAllocData(Shape1(static_cast(indptr[num_rows]))); // fill col_idx and value arrays of the csr - mxnet_op::Kernel::Launch(s, num_rows, + mxnet_op::Kernel::Launch(s, num_threads, csr->data().dptr(), csr->aux_data(csr::kIdx).dptr(), indptr, dns_data, num_rows, num_cols); }); @@ -246,19 +270,22 @@ inline void CastStorageDnsCsrImpl(const OpContext& ctx, struct CopyCsrDataToDns { /*! 
* \brief - * \param i the i-th row of the dns tensor - * \param dns_data data blob of the dns tensor - * \param col_idx column idx array of the csr - * \param indptr indptr array of the csr - * \param csr_data data blob of the csr tensor - * \param num_cols number of columns of the dns + * \param i the i-th row of the dns tensor + * \param dns_data data blob of the dns tensor + * \param col_idx column idx array of the csr tensor + * \param indptr indptr array of the csr tensor + * \param csr_data data blob of the csr tensor + * \param num_cols number of columns of the dns tensor */ template - MSHADOW_XINLINE static void Map(int i, DType* dns_data, const CType* col_idx, - const IType* indptr, const DType* csr_data, - const int num_cols) { - const int offset = i * num_cols; - for (auto j = indptr[i]; j < indptr[i+1]; ++j) { + MSHADOW_XINLINE static void Map(int i, + DType* dns_data, + const CType* col_idx, + const IType* indptr, + const DType* csr_data, + const nnvm::dim_t num_cols) { + const nnvm::dim_t offset = i * num_cols; + for (IType j = indptr[i]; j < indptr[i+1]; ++j) { dns_data[offset+col_idx[j]] = csr_data[j]; } } @@ -268,25 +295,30 @@ struct CopyCsrDataToDns { * \brief Casts a csr matrix to dns format. */ template -void CastStorageCsrDnsImpl(const OpContext& ctx, const NDArray& csr, TBlob* dns) { +void CastStorageCsrDnsImpl(const OpContext& ctx, + const NDArray& csr, + TBlob* dns) { CHECK(dns != nullptr); CHECK_EQ(csr.storage_type(), kCSRStorage); CHECK_EQ(dns->shape_.ndim(), 2); CHECK_EQ(dns->shape_, csr.shape()); + using nnvm::dim_t; mshadow::Stream* s = ctx.get_stream(); MSHADOW_TYPE_SWITCH(dns->type_flag_, DType, { // data type MSHADOW_IDX_TYPE_SWITCH(csr.aux_type(csr::kIndPtr), IType, { // indptr type MSHADOW_IDX_TYPE_SWITCH(csr.aux_type(csr::kIdx), CType, { // col idx type - const index_t num_rows = dns->shape_[0]; - const index_t num_cols = dns->shape_[1]; + const dim_t num_rows = dns->shape_[0]; + const dim_t num_cols = dns->shape_[1]; DType* dns_data = dns->dptr(); - mxnet_op::Kernel::Launch(s, dns->shape_.Size(), dns_data); + dim_t num_threads = dns->shape_.Size(); + mxnet_op::Kernel::Launch(s, num_threads, dns_data); if (!csr.storage_initialized()) return; const IType* indptr = csr.aux_data(csr::kIndPtr).dptr(); const CType* col_idx = csr.aux_data(csr::kIdx).dptr(); const DType* csr_data = csr.data().dptr(); - mxnet_op::Kernel::Launch(s, num_rows, dns_data, - col_idx, indptr, csr_data, num_cols); + num_threads = num_rows; + mxnet_op::Kernel::Launch(s, num_threads, + dns_data, col_idx, indptr, csr_data, num_cols); }); }); }); diff --git a/src/operator/tensor/dot-inl.cuh b/src/operator/tensor/dot-inl.cuh index f8cd4faf8632..41c3faaf419f 100644 --- a/src/operator/tensor/dot-inl.cuh +++ b/src/operator/tensor/dot-inl.cuh @@ -27,97 +27,11 @@ #include #include - -#include +#include "./util/tensor_util-inl.cuh" namespace mxnet { namespace op { -/*! - * \brief GPU auxiliary kernel to flag non-zero rows of an rsp matrix with indices. - * Parallelized by matrix rows: 1 thread/row - */ -struct SetRspRowFlgKernel { - /*! - * \brief - * \param tid global thread id - * \param row_flg array to flag storage indices of non-zero rows - * \param row_idx rsp matrix row index array storing indices of non-zero rows - * \param nnr rsp matrix number of non-zero rows (storage shape) - */ - template - __device__ __forceinline__ static void Map(int tid, - RType* row_flg, - const RType* row_idx, - const nnvm::dim_t nnr) { - if (tid < nnr) { - row_flg[row_idx[tid]] = tid+1; - } - } -}; - -/*! 
- * \brief GPU auxiliary kernel for marking non-zero columns of a csr matrix. - * Parallelized by matrix rows: 1 warp/row - */ -struct MarkCsrZeroColsWarpKernel { - /*! - * \brief - * \param tid global thread id - * \param col_idx csr matrix column indices - * \param indptr csr matrix row index pointer - * \param num_rows csr matrix number of rows - * \param num_cols csr matrix number of columns - */ - template - __device__ __forceinline__ static void Map(int tid, - nnvm::dim_t* flg, - const CType* col_idx, - const IType* indptr, - const nnvm::dim_t num_rows, - const nnvm::dim_t num_cols) { - typedef unsigned long long int uint64_cu; - static_assert(sizeof(uint64_cu) == sizeof(nnvm::dim_t), "unexpected sizeof dim_t"); - - const nnvm::dim_t warp_id = tid / 32; // global warp id - const nnvm::dim_t lane = tid & (32-1); // local thread id within warp - - if (warp_id < num_rows) { - uint64_cu zero = 0; - uint64_cu one = 1; - for (IType j = indptr[warp_id]+lane; j < indptr[warp_id+1]; j+=32) { - atomicCAS(reinterpret_cast(flg+col_idx[j]), zero, one); - } - } - } -}; - -/*! - * \brief GPU auxiliary kernel for filling the row index array of an rsp matrix. - * Parallelized by matrix rows: 1 thread/row - */ -struct FillRspRowIdxKernel { - /*! - * \brief - * \param tid global thread id - * \param row_idx row index array to store indices of non-zero rows - * \param row_flg_sum inclusive prefix sum array over 0/1 marked row flag array - * \param num_rows rsp matrix number of rows (shape) - */ - template - __device__ __forceinline__ static void Map(int tid, - RType* row_idx, - const nnvm::dim_t* row_flg_sum, - const nnvm::dim_t num_rows) { - if (tid < num_rows) { - nnvm::dim_t prev = (tid == 0)? 0 : row_flg_sum[tid-1]; - if (row_flg_sum[tid] > prev) { - row_idx[prev] = static_cast(tid); - } - } - } -}; - /*! 
* \brief GPU scalar kernel of dot(csr, dns1) = dns2 * Parallelization by output matrix elements: 1 thread/element @@ -721,7 +635,7 @@ inline void DotCsrDnsRspImpl(const OpContext& ctx, num_threads = num_cols_l; Kernel::Launch(s, num_threads, row_flg_out); num_threads = num_rows_l * threads_per_warp; - Kernel::Launch(s, num_threads, + Kernel::Launch(s, num_threads, row_flg_out, col_idx_l.dptr(), indptr_l.dptr(), num_rows_l, num_cols_l); cub::DeviceScan::InclusiveSum(d_temp_storage, @@ -840,7 +754,7 @@ inline void DotCsrRspRspImpl(const OpContext& ctx, num_threads = num_cols_l; Kernel::Launch(s, num_threads, row_flg_out); num_threads = num_rows_l * threads_per_warp; - Kernel::Launch(s, num_threads, + Kernel::Launch(s, num_threads, row_flg_out, col_idx_l.dptr(), indptr_l.dptr(), num_rows_l, num_cols_l); cub::DeviceScan::InclusiveSum(d_temp_storage, diff --git a/src/operator/tensor/dot-inl.h b/src/operator/tensor/dot-inl.h index 79b87da77011..7b7d82b01b91 100644 --- a/src/operator/tensor/dot-inl.h +++ b/src/operator/tensor/dot-inl.h @@ -69,7 +69,7 @@ void DotForward_(const nnvm::NodeAttrs& attrs, << "Binary function only support input/output with the same type"; CHECK(outputs[0].type_flag_ == kFloat32 || outputs[0].type_flag_ == kFloat64) << "dot only supports float32 and float64"; - MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { + MSHADOW_SGL_DBL_TYPE_SWITCH(outputs[0].type_flag_, DType, { if (inputs[0].ndim() == 1 && inputs[1].ndim() == 1) { CHECK_NE(req[0], kAddTo) << "AddTo not yet suported"; Tensor out = outputs[0].get(s); @@ -127,7 +127,7 @@ void DotBackward_(const nnvm::NodeAttrs& attrs, Stream *s = ctx.get_stream(); CHECK_NE(req[0], kWriteInplace); CHECK_NE(req[1], kWriteInplace); - MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { + MSHADOW_SGL_DBL_TYPE_SWITCH(outputs[0].type_flag_, DType, { if (inputs[1].ndim() == 1 && inputs[2].ndim() == 1) { Tensor mout_grad = inputs[0].get(s); Tensor mlhs_data = inputs[1].get(s); @@ -205,11 +205,11 @@ inline bool DotForwardInferStorageType(const nnvm::NodeAttrs& attrs, CHECK_EQ(out_attrs->size(), 1U); const DotParam& param = nnvm::get(attrs.parsed); // csr has many zero columns, so the result of dot(csr.T, matrix) should be rsp - // TODO(stefan/haibin): don't enforce kRowSparseStorage if out_attrs has already been set + // TODO(stefan/haibin/jun): check type_assign return value if (param.transpose_a && kCSRStorage == (*in_attrs)[0]) { - STORAGE_TYPE_ASSIGN_CHECK(*out_attrs, 0, kRowSparseStorage); + type_assign(&((*out_attrs)[0]), kRowSparseStorage); } else { - STORAGE_TYPE_ASSIGN_CHECK(*out_attrs, 0, kDefaultStorage); + type_assign(&((*out_attrs)[0]), kDefaultStorage); } return true; } @@ -221,11 +221,11 @@ inline bool DotBackwardInferStorageType(const nnvm::NodeAttrs& attrs, CHECK_EQ(in_attrs->size(), 3U); CHECK_EQ(out_attrs->size(), 2U); const DotParam& param = nnvm::get(attrs.parsed); - STORAGE_TYPE_ASSIGN_CHECK(*out_attrs, 0, kDefaultStorage); + type_assign(&((*out_attrs)[0]), kDefaultStorage); if (!param.transpose_a && kCSRStorage == (*in_attrs)[1]) { - STORAGE_TYPE_ASSIGN_CHECK(*out_attrs, 1, kRowSparseStorage); + type_assign(&((*out_attrs)[1]), kRowSparseStorage); } else { - STORAGE_TYPE_ASSIGN_CHECK(*out_attrs, 1, kDefaultStorage); + type_assign(&((*out_attrs)[1]), kDefaultStorage); } return true; } @@ -492,7 +492,7 @@ inline void DotCsrDnsDnsImpl(const OpContext& ctx, const TBlob& data_r = rhs; const TBlob data_out = *ret; - MSHADOW_TYPE_SWITCH(data_l.type_flag_, DType, { // data type + 
MSHADOW_SGL_DBL_TYPE_SWITCH(data_l.type_flag_, DType, { // data type MSHADOW_IDX_TYPE_SWITCH(indptr_l.type_flag_, IType, { // indptr type MSHADOW_IDX_TYPE_SWITCH(col_idx_l.type_flag_, CType, { // col idx type dim_t num_threads; @@ -549,7 +549,7 @@ inline void DotCsrDnsRspImpl(const OpContext& ctx, const TBlob data_out = ret->data(); const TBlob row_idx_out = ret->aux_data(rowsparse::kIdx); - MSHADOW_TYPE_SWITCH(data_l.type_flag_, DType, { // data type + MSHADOW_SGL_DBL_TYPE_SWITCH(data_l.type_flag_, DType, { // data type MSHADOW_IDX_TYPE_SWITCH(indptr_l.type_flag_, IType, { // indptr type MSHADOW_IDX_TYPE_SWITCH(col_idx_l.type_flag_, CType, { // col idx type MSHADOW_IDX_TYPE_SWITCH(row_idx_out.type_flag_, RType, { // row idx type @@ -609,7 +609,7 @@ inline void DotCsrRspDnsImpl(const OpContext& ctx, mshadow::Stream* s = ctx.get_stream(); if (!lhs.storage_initialized() || !rhs.storage_initialized()) { if (kWriteTo == req) { - MSHADOW_TYPE_SWITCH(ret->type_flag_, DType, { // data type + MSHADOW_SGL_DBL_TYPE_SWITCH(ret->type_flag_, DType, { // data type mxnet_op::Kernel::Launch( s, ret->Size(), ret->dptr()); }); @@ -624,7 +624,7 @@ inline void DotCsrRspDnsImpl(const OpContext& ctx, const TBlob data_r = rhs.data(); const TBlob row_idx_r = rhs.aux_data(rowsparse::kIdx); - MSHADOW_TYPE_SWITCH(data_l.type_flag_, DType, { // data type + MSHADOW_SGL_DBL_TYPE_SWITCH(data_l.type_flag_, DType, { // data type MSHADOW_IDX_TYPE_SWITCH(indptr_l.type_flag_, IType, { // indptr type MSHADOW_IDX_TYPE_SWITCH(col_idx_l.type_flag_, CType, { // col idx type MSHADOW_IDX_TYPE_SWITCH(row_idx_r.type_flag_, RType, { // row idx type @@ -691,7 +691,7 @@ inline void DotCsrRspRspImpl(const OpContext& ctx, const TBlob data_out = ret->data(); const TBlob row_idx_out = ret->aux_data(rowsparse::kIdx); - MSHADOW_TYPE_SWITCH(data_l.type_flag_, DType, { // data type + MSHADOW_SGL_DBL_TYPE_SWITCH(data_l.type_flag_, DType, { // data type MSHADOW_IDX_TYPE_SWITCH(indptr_l.type_flag_, IType, { // indptr type MSHADOW_IDX_TYPE_SWITCH(col_idx_l.type_flag_, CType, { // col idx type MSHADOW_IDX_TYPE_SWITCH(row_idx_r.type_flag_, RType, { // row idx type @@ -861,7 +861,7 @@ void BatchDotForward_(const nnvm::NodeAttrs& attrs, << "Binary function only support input/output with the same type"; CHECK(outputs[0].type_flag_ == kFloat32 || outputs[0].type_flag_ == kFloat64) << "dot only supports float32 and float64"; - MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { + MSHADOW_SGL_DBL_TYPE_SWITCH(outputs[0].type_flag_, DType, { mshadow::Tensor out = outputs[0].get(s); mshadow::Tensor mlhs = inputs[0].get(s); mshadow::Tensor mrhs = inputs[1].get(s); @@ -903,7 +903,7 @@ void BatchDotBackward_(const nnvm::NodeAttrs& attrs, CHECK_NE(req[0], kWriteInplace); CHECK(outputs[0].type_flag_ == kFloat32 || outputs[0].type_flag_ == kFloat64) << "dot only supports float32 and float64"; - MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { + MSHADOW_SGL_DBL_TYPE_SWITCH(outputs[0].type_flag_, DType, { mshadow::Tensor mout_grad = inputs[0].get(s); mshadow::Tensor mlhs_data = inputs[1].get(s); mshadow::Tensor mrhs_data = inputs[2].get(s); diff --git a/src/operator/tensor/util/tensor_util-inl.cuh b/src/operator/tensor/util/tensor_util-inl.cuh new file mode 100644 index 000000000000..cf268e7ae9fc --- /dev/null +++ b/src/operator/tensor/util/tensor_util-inl.cuh @@ -0,0 +1,240 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2017 by Contributors
+ * \file tensor_util-inl.cuh
+ * \brief commonly utilized tensor operator GPU kernels
+ */
+#ifndef MXNET_OPERATOR_TENSOR_UTIL_TENSOR_UTIL_INL_CUH_
+#define MXNET_OPERATOR_TENSOR_UTIL_TENSOR_UTIL_INL_CUH_
+
+#include <cub/cub.cuh>
+#include <dmlc/logging.h>
+#include <mxnet/operator.h>
+
+namespace mxnet {
+namespace op {
+
+/*!
+ * \brief Thread kernel for marking non-zero rows of a tensor.
+ * Parallelized by tensor rows: 1 thread/row
+ */
+struct MarkRspRowThreadKernel {
+  /*!
+   * \brief
+   * \param tid         global thread id
+   * \param row_flg     row flag array to mark non-zero rows
+   * \param dns         dense matrix data
+   * \param num_rows    number of rows (size of first dimension of tensor)
+   * \param row_length  number of elements per row
+   */
+  template<typename DType>
+  __device__ __forceinline__ static void Map(int tid,
+                                             nnvm::dim_t* row_flg,
+                                             const DType* dns,
+                                             const nnvm::dim_t num_rows,
+                                             const nnvm::dim_t row_length) {
+    using nnvm::dim_t;
+    if (tid < num_rows) {
+      dim_t j = 0;
+      dim_t offset = tid * row_length;
+      for (; j < row_length; ++j) {
+        if (dns[offset+j] != 0) {
+          break;
+        }
+      }
+      if (j < row_length) {
+        row_flg[tid] = 1;  // mark as one for non-zero row
+      } else {
+        row_flg[tid] = 0;  // mark as zero for zero row
+      }
+    }
+  }
+};
+
+/*!
+ * \brief Warp kernel for marking non-zero rows of a tensor.
+ * Parallelized by tensor rows: 1 warp/row
+ */
+struct MarkRspRowWarpKernel {
+  template<typename DType>
+  __device__ __forceinline__ static void Map(int tid,
+                                             nnvm::dim_t* row_flg,
+                                             const DType* dns,
+                                             const nnvm::dim_t num_rows,
+                                             const nnvm::dim_t row_length) {
+    using nnvm::dim_t;
+    typedef cub::WarpReduce<dim_t> WarpReduce;
+    const dim_t warps_per_block = mshadow::cuda::kBaseThreadNum / 32;
+    __shared__ typename WarpReduce::TempStorage temp_storage[warps_per_block];
+
+    const dim_t warp_id   = tid / 32;          // global warp id
+    const dim_t warp_lane = threadIdx.x / 32;  // local warp id within thread block
+    const dim_t lane      = tid & (32-1);      // local thread id within warp
+
+    if (warp_id < num_rows) {
+      dim_t flg = 0;
+      dim_t offset = warp_id * row_length;
+      for (dim_t j = lane; j < row_length; j+=32) {
+        if (dns[offset+j] != 0) {
+          // avoid break: causes slower performance on sparse tensors (<20% density),
+          // due to thread divergence
+          flg++;
+        }
+      }
+      dim_t aggr = WarpReduce(temp_storage[warp_lane]).Sum(flg);
+      if (lane == 0) {
+        if (aggr > 0) {
+          row_flg[warp_id] = 1;  // mark as one for non-zero row
+        } else {
+          row_flg[warp_id] = 0;  // mark as zero for zero row
+        }
+      }
+    }
+  }
+};
+
+/*!
+ * \brief Block kernel for marking non-zero rows of a tensor.
+ * Parallelized by tensor rows: 1 threadBlock/row
+ */
+struct MarkRspRowBlockKernel {
+  template<typename DType>
+  __device__ __forceinline__ static void Map(int tid,
+                                             nnvm::dim_t* row_flg,
+                                             const DType* dns,
+                                             const nnvm::dim_t num_rows,
+                                             const nnvm::dim_t row_length) {
+    using nnvm::dim_t;
+    using mshadow::cuda::kBaseThreadNum;
+    typedef cub::BlockReduce<dim_t, kBaseThreadNum> BlockReduce;
+    __shared__ typename BlockReduce::TempStorage temp_storage;
+    if (blockIdx.x < num_rows) {
+      dim_t flg = 0;
+      dim_t offset = blockIdx.x * row_length;
+      for (dim_t j = threadIdx.x; j < row_length; j+=kBaseThreadNum) {
+        if (dns[offset+j] != 0) {
+          // avoid break: causes slower performance on sparse tensors (<20% density),
+          // due to thread divergence
+          flg++;
+        }
+      }
+      dim_t aggr = BlockReduce(temp_storage).Sum(flg);
+      if (threadIdx.x == 0) {
+        if (aggr > 0) {
+          row_flg[blockIdx.x] = 1;  // mark as one for non-zero row
+        } else {
+          row_flg[blockIdx.x] = 0;  // mark as zero for zero row
+        }
+      }
+    }
+  }
+};
+
+/*!
+ * \brief GPU kernel to flag non-zero rows of an rsp tensor with indices.
+ * Parallelized by matrix rows: 1 thread/row
+ */
+struct SetRspRowFlgKernel {
+  /*!
+   * \brief
+   * \param tid      global thread id
+   * \param row_flg  array to flag storage indices of non-zero rows
+   * \param row_idx  rsp matrix row index array storing indices of non-zero rows
+   * \param nnr      rsp matrix number of non-zero rows (storage shape)
+   */
+  template<typename RType>
+  __device__ __forceinline__ static void Map(int tid,
+                                             RType* row_flg,
+                                             const RType* row_idx,
+                                             const nnvm::dim_t nnr) {
+    if (tid < nnr) {
+      row_flg[row_idx[tid]] = tid+1;
+    }
+  }
+};
+
+/*!
+ * \brief GPU kernel for filling the row index array of an rsp tensor.
+ * Parallelized by tensor rows: 1 thread/row
+ */
+struct FillRspRowIdxKernel {
+  /*!
+   * \brief
+   * \param tid          global thread id
+   * \param row_idx      row index array to store indices of non-zero rows
+   * \param row_flg_sum  inclusive prefix sum array over 0/1 marked row flag array
+   * \param num_rows     rsp tensor number of rows (shape)
+   */
+  template<typename RType>
+  __device__ __forceinline__ static void Map(int tid,
+                                             RType* row_idx,
+                                             const nnvm::dim_t* row_flg_sum,
+                                             const nnvm::dim_t num_rows) {
+    if (tid < num_rows) {
+      nnvm::dim_t prev = (tid == 0)? 0 : row_flg_sum[tid-1];
+      if (row_flg_sum[tid] > prev) {
+        row_idx[prev] = static_cast<RType>(tid);
+      }
+    }
+  }
+};
+
+/*!
+ * \brief GPU kernel for marking non-zero columns of a csr matrix.
+ * Parallelized by matrix rows: 1 warp/row
+ */
+struct MarkCsrColWarpKernel {
+  /*!
+   * \brief
+   * \param tid       global thread id
+   * \param flg       flg array to mark non-zero columns
+   * \param col_idx   csr matrix column indices
+   * \param indptr    csr matrix row index pointer
+   * \param num_rows  csr matrix number of rows
+   * \param num_cols  csr matrix number of columns
+   */
+  template<typename CType, typename IType>
+  __device__ __forceinline__ static void Map(int tid,
+                                             nnvm::dim_t* flg,
+                                             const CType* col_idx,
+                                             const IType* indptr,
+                                             const nnvm::dim_t num_rows,
+                                             const nnvm::dim_t num_cols) {
+    typedef unsigned long long int uint64_cu;
+    static_assert(sizeof(uint64_cu) == sizeof(nnvm::dim_t), "unexpected sizeof dim_t");
+
+    const nnvm::dim_t warp_id = tid / 32;      // global warp id
+    const nnvm::dim_t lane    = tid & (32-1);  // local thread id within warp
+
+    if (warp_id < num_rows) {
+      uint64_cu zero = 0;
+      uint64_cu one = 1;
+      for (IType j = indptr[warp_id]+lane; j < indptr[warp_id+1]; j+=32) {
+        atomicCAS(reinterpret_cast<uint64_cu*>(flg+col_idx[j]), zero, one);
+      }
+    }
+  }
+};
+
+}  // namespace op
+}  // namespace mxnet
+
+#endif  // MXNET_OPERATOR_TENSOR_UTIL_TENSOR_UTIL_INL_CUH_
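For readers following the dense-to-row_sparse cast path, the sketch below illustrates the three-step pattern that CastStorageDnsRspImpl builds out of these kernels: flag each non-zero row with 0/1, run cub::DeviceScan::InclusiveSum over the flags in place, then scatter each flagged row's index into the compacted row-index array, whose length is the last prefix-sum entry (nnr). This is a minimal standalone illustration under my own naming, not MXNet code: the kernels MarkRows and FillRowIdx and the fixed launch configuration are placeholders chosen for the example, and MXNet's actual code dispatches among the thread/warp/block kernels above instead.

// tensor_util_sketch.cu -- illustrative only, not part of the patch.
#include <cub/cub.cuh>
#include <cstdint>
#include <cstdio>
#include <vector>

// Flag each non-zero row of a row-major dense matrix with 1, all-zero rows with 0.
__global__ void MarkRows(int64_t* row_flg, const float* dns,
                         int64_t num_rows, int64_t row_length) {
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid < num_rows) {
    int64_t offset = tid * row_length;
    int64_t flg = 0;
    for (int64_t j = 0; j < row_length; ++j) {
      if (dns[offset + j] != 0.0f) { flg = 1; break; }
    }
    row_flg[tid] = flg;
  }
}

// After an inclusive prefix sum over the flags, row_flg_sum[i] exceeds the previous
// entry exactly for non-zero rows, and that previous entry is the compacted output slot.
__global__ void FillRowIdx(int64_t* row_idx, const int64_t* row_flg_sum,
                           int64_t num_rows) {
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid < num_rows) {
    int64_t prev = (tid == 0) ? 0 : row_flg_sum[tid - 1];
    if (row_flg_sum[tid] > prev) row_idx[prev] = tid;
  }
}

int main() {
  const int64_t num_rows = 4, row_length = 3;
  // rows 1 and 3 are non-zero
  std::vector<float> h_dns = {0, 0, 0,  1, 0, 2,  0, 0, 0,  0, 3, 0};
  float* d_dns;
  int64_t *d_flg, *d_idx;
  cudaMalloc(reinterpret_cast<void**>(&d_dns), h_dns.size() * sizeof(float));
  cudaMalloc(reinterpret_cast<void**>(&d_flg), num_rows * sizeof(int64_t));
  cudaMalloc(reinterpret_cast<void**>(&d_idx), num_rows * sizeof(int64_t));
  cudaMemcpy(d_dns, h_dns.data(), h_dns.size() * sizeof(float), cudaMemcpyHostToDevice);

  MarkRows<<<1, 128>>>(d_flg, d_dns, num_rows, row_length);

  // In-place inclusive prefix sum over the row flags, as in CastStorageDnsRspImpl.
  void* d_temp = nullptr;
  size_t temp_bytes = 0;
  cub::DeviceScan::InclusiveSum(d_temp, temp_bytes, d_flg, d_flg, num_rows);
  cudaMalloc(&d_temp, temp_bytes);
  cub::DeviceScan::InclusiveSum(d_temp, temp_bytes, d_flg, d_flg, num_rows);

  // The last prefix-sum entry is the number of non-zero rows (nnr).
  int64_t nnr = 0;
  cudaMemcpy(&nnr, d_flg + num_rows - 1, sizeof(int64_t), cudaMemcpyDeviceToHost);

  FillRowIdx<<<1, 128>>>(d_idx, d_flg, num_rows);
  std::vector<int64_t> h_idx(nnr);
  cudaMemcpy(h_idx.data(), d_idx, nnr * sizeof(int64_t), cudaMemcpyDeviceToHost);

  printf("nnr = %lld, row_idx =", static_cast<long long>(nnr));
  for (int64_t i = 0; i < nnr; ++i) printf(" %lld", static_cast<long long>(h_idx[i]));
  printf("\n");  // expected: nnr = 2, row_idx = 1 3

  cudaFree(d_dns); cudaFree(d_flg); cudaFree(d_idx); cudaFree(d_temp);
  return 0;
}

The same mark/scan/scatter structure backs the CSR-transpose dot kernels as well, with MarkCsrColWarpKernel producing the 0/1 column flags that feed the prefix sum.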