Commit 889a09e
Minor fixes sparse ops (apache#160)
* change CPU kernel inline directives, data types, and function docs

* update the dot dtype switch to use 32- and 64-bit floating point only

* use type_assign instead of STORAGE_TYPE_ASSIGN_CHECK (see the sketch below)

* add tensor_util-inl.cuh for common tensor operator GPU kernels
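
For context (an illustration added here, not part of the commit), the difference between the two helpers can be sketched as follows; this assumes MXNet's type_assign and STORAGE_TYPE_ASSIGN_CHECK from src/operator/operator_common.h, with usage paraphrased rather than quoted:

  // type_assign(&var, val) tries to unify a type value and returns false on
  // a conflict, so the caller can react (e.g. fall back to dense storage):
  int stype = -1;                                // -1 == storage type unknown
  if (!type_assign(&stype, kRowSparseStorage)) {
    // conflicting type was already inferred; handle it gracefully
  }
  // STORAGE_TYPE_ASSIGN_CHECK(vec, index, kRowSparseStorage), by contrast,
  // aborts via LOG(FATAL) when the previously inferred type disagrees.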
stefanhenneking authored and eric-haibin-lin committed Aug 14, 2017
1 parent 8ebc012 commit 889a09e
Showing 5 changed files with 396 additions and 352 deletions.
214 changes: 36 additions & 178 deletions src/operator/tensor/cast_storage-inl.cuh
@@ -25,162 +25,20 @@
#ifndef MXNET_OPERATOR_TENSOR_CAST_STORAGE_INL_CUH_
#define MXNET_OPERATOR_TENSOR_CAST_STORAGE_INL_CUH_

#include <cub/cub.cuh>
#include <mxnet/base.h>
#include <mxnet/operator.h>
#include <nnvm/tuple.h>

#include <cub/cub.cuh>
#include "./util/tensor_util-inl.cuh"

namespace mxnet {
namespace op {

/*!
* \brief Thread kernel for marking non-zero rows of a tensor.
* Parallelized by tensor rows: 1 thread/row
*/
struct MarkRspRowIdxThreadKernel {
/*!
* \brief
* \param tid global thread id
* \param row_flg row flag array to mark non-zero rows
* \param dns dense matrix data
* \param num_rows number of rows (size of first dimension of tensor)
* \param row_length number of elements per row
*/
template<typename DType, typename RType>
__device__ __forceinline__ static void Map(int tid,
RType* row_flg,
const DType* dns,
const nnvm::dim_t num_rows,
const nnvm::dim_t row_length) {
using nnvm::dim_t;
if (tid < num_rows) {
dim_t j = 0;
dim_t offset = tid * row_length;
for (; j < row_length; ++j) {
if (dns[offset+j] != 0) {
break;
}
}
if (j < row_length) {
row_flg[tid] = 1; // mark as one for non-zero row
} else {
row_flg[tid] = 0; // mark as zero for zero row
}
}
}
};
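
(Per the commit message, this and the following marking kernels appear to move out of this file into the new tensor_util-inl.cuh, renamed MarkRspRow{Thread,Warp,Block}Kernel; the launch sites further down already use the new names.) To make the row-marking semantics concrete, a small illustration added here, not part of the diff:

  // dns (row-major, 4x2)  ->  row_flg
  // [ 0  0 ]                   0   (all-zero row)
  // [ 3  0 ]                   1   (first non-zero found at j == 0)
  // [ 0  0 ]                   0
  // [ 5  7 ]                   1
  // One thread scans one row and stops at its first non-zero element.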

/*!
* \brief Warp kernel for marking non-zero rows of a tensor.
* Parallelized by tensor rows: 1 warp/row
*/
struct MarkRspRowIdxWarpKernel {
template<typename DType, typename RType>
__device__ __forceinline__ static void Map(int tid,
RType* row_flg,
const DType* dns,
const nnvm::dim_t num_rows,
const nnvm::dim_t row_length) {
using nnvm::dim_t;
typedef cub::WarpReduce<dim_t> WarpReduce;
const dim_t warps_per_block = mshadow::cuda::kBaseThreadNum / 32;
__shared__ typename WarpReduce::TempStorage temp_storage[warps_per_block];

const dim_t warp_id = tid / 32; // global warp id
const dim_t warp_lane = threadIdx.x / 32; // local warp id within thread block
const dim_t lane = tid & (32-1); // local thread id within warp

if (warp_id < num_rows) {
dim_t flg = 0;
dim_t offset = warp_id * row_length;
for (dim_t j = lane; j < row_length; j+=32) {
if (dns[offset+j] != 0) {
// avoid break: causes slower performance on sparse tensors (<20% density),
// due to thread divergence
flg++;
}
}
dim_t aggr = WarpReduce(temp_storage[warp_lane]).Sum(flg);
if (lane == 0) {
if (aggr > 0) {
row_flg[warp_id] = 1; // mark as one for non-zero row
} else {
row_flg[warp_id] = 0; // mark as zero for zero row
}
}
}
}
};
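
A worked example for the warp variant (illustrative, not in the diff): with row_length = 8, lane j of a warp checks columns j, j+32, j+64, and so on.

  // row = [0 0 4 0 0 0 9 0]: lanes 2 and 6 each count one non-zero, so
  // WarpReduce sums flg across the 32 lanes to aggr = 2, and lane 0 writes
  // row_flg[warp_id] = 1. Counting instead of break-ing keeps the lanes
  // convergent, which is exactly the divergence issue the comment above
  // refers to.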

/*!
* \brief Block kernel for marking non-zero rows of a tensor.
* Parallelized by tensor rows: 1 threadBlock/row
*/
struct MarkRspRowIdxBlockKernel {
template<typename DType, typename RType>
__device__ __forceinline__ static void Map(int tid,
RType* row_flg,
const DType* dns,
const nnvm::dim_t num_rows,
const nnvm::dim_t row_length) {
using nnvm::dim_t;
using mshadow::cuda::kBaseThreadNum;
typedef cub::BlockReduce<dim_t, kBaseThreadNum> BlockReduce;
__shared__ typename BlockReduce::TempStorage temp_storage;
if (blockIdx.x < num_rows) {
dim_t flg = 0;
dim_t offset = blockIdx.x * row_length;
for (dim_t j = threadIdx.x; j < row_length; j+=kBaseThreadNum) {
if (dns[offset+j] != 0) {
// avoid break: causes slower performance on sparse tensors (<20% density),
// due to thread divergence
flg++;
}
}
dim_t aggr = BlockReduce(temp_storage).Sum(flg);
if (threadIdx.x == 0) {
if (aggr > 0) {
row_flg[blockIdx.x] = 1; // mark as one for non-zero row
} else {
row_flg[blockIdx.x] = 0; // mark as zero for zero row
}
}
}
}
};
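
One detail worth spelling out for the block variant:

  // Note (added for clarity): cub::BlockReduce::Sum() is a collective that
  // every thread of the block must reach, a second reason the loop counts
  // rather than breaking early. Only threadIdx.x == 0 receives the valid
  // aggregate, hence the guarded write to row_flg[blockIdx.x].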

/*!
* \brief Kernel for filling the row index array of the rsp tensor.
* Parallelized by tensor rows: 1 thread/row
*/
struct FillRspRowIdxKernel {
/*!
* \brief
* \param tid global thread id
* \param row_idx row index array to store indices of non-zero rows
* \param row_flg_sum inclusive prefix sum array over marked row flag array
* \param num_rows number of rows (size of first dimension of tensor)
*/
template<typename RType>
__device__ __forceinline__ static void Map(int tid,
RType* row_idx,
const RType* row_flg_sum,
const nnvm::dim_t num_rows) {
if (tid < num_rows) {
nnvm::dim_t prev = (tid == 0)? 0 : row_flg_sum[tid-1];
if (row_flg_sum[tid] > prev) {
row_idx[prev] = tid;
}
}
}
};
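
The flag array plus inclusive prefix sum implement a standard stream compaction; a worked example (added here, not in the diff):

  // row_flg (after marking)    : 1 0 1 1 0
  // row_flg_sum (inclusive sum): 1 1 2 3 3
  // tid 0: prev = 0, sum = 1 > prev  ->  row_idx[0] = 0
  // tid 2: prev = 1, sum = 2 > prev  ->  row_idx[1] = 2
  // tid 3: prev = 2, sum = 3 > prev  ->  row_idx[2] = 3
  // => row_idx = [0, 2, 3]; the last prefix-sum entry (3) is the nnr read
  //    back in CastStorageDnsRspImpl below.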

/*!
- * \brief Kernel for filling the value array of the rsp tensor.
+ * \brief GPU Kernel for filling the value array of the rsp tensor.
* Parallelized by rsp tensor elements: 1 thread/element
*/
- struct FillRspValsKernel {
+ struct CastDnsRspValsKernel {
/*!
* \brief
* \param tid global thread id
@@ -243,7 +101,7 @@ inline void CastStorageDnsRspImpl(const OpContext& ctx,
LOG(FATAL) << "CastStorageDnsRspImpl GPU kernels expect warpSize=32";
}
// Determine temporary device storage requirements
- RType* row_flg = NULL;
+ dim_t* row_flg = NULL;
void* d_temp_storage = NULL;
size_t temp_storage_bytes = 0;
cub::DeviceScan::InclusiveSum(d_temp_storage,
@@ -254,10 +112,10 @@
mshadow::Stream<gpu>::GetStream(s));

// Allocate temp storage for marking non-zero rows and for cub's prefix sum
- auto workspace = AllocateTempDataForCast<gpu, 1, char>(ctx, Shape1(num_rows*sizeof(RType)
+ auto workspace = AllocateTempDataForCast<gpu, 1, char>(ctx, Shape1(num_rows*sizeof(dim_t)
                                                         + temp_storage_bytes));
- row_flg = reinterpret_cast<RType*>(workspace.dptr_);
- d_temp_storage = workspace.dptr_ + num_rows*sizeof(RType);
+ row_flg = reinterpret_cast<dim_t*>(workspace.dptr_);
+ d_temp_storage = workspace.dptr_ + num_rows*sizeof(dim_t);

// Mark non-zero rows as 'one' in row_flg
// Different kernel versions are optimized for different matrix instances
@@ -268,31 +126,31 @@
switch (kernel_version) {
case 1:
num_threads = num_rows;
- Kernel<MarkRspRowIdxThreadKernel, gpu>::Launch(s, num_threads,
+ Kernel<MarkRspRowThreadKernel, gpu>::Launch(s, num_threads,
row_flg, dns.dptr<DType>(), num_rows, row_length);
break;
case 2:
num_threads = num_rows * threads_per_warp;
- Kernel<MarkRspRowIdxWarpKernel, gpu>::Launch(s, num_threads,
+ Kernel<MarkRspRowWarpKernel, gpu>::Launch(s, num_threads,
row_flg, dns.dptr<DType>(), num_rows, row_length);
break;
case 3:
num_threads = num_rows * threads_per_block;
- Kernel<MarkRspRowIdxBlockKernel, gpu>::Launch(s, num_threads,
+ Kernel<MarkRspRowBlockKernel, gpu>::Launch(s, num_threads,
row_flg, dns.dptr<DType>(), num_rows, row_length);
break;
default:
if (row_length < threads_per_warp) {
num_threads = num_rows;
- Kernel<MarkRspRowIdxThreadKernel, gpu>::Launch(s, num_threads,
+ Kernel<MarkRspRowThreadKernel, gpu>::Launch(s, num_threads,
row_flg, dns.dptr<DType>(), num_rows, row_length);
} else if (row_length < threads_per_block || num_rows > min_num_warps) {
num_threads = num_rows * threads_per_warp;
- Kernel<MarkRspRowIdxWarpKernel, gpu>::Launch(s, num_threads,
+ Kernel<MarkRspRowWarpKernel, gpu>::Launch(s, num_threads,
row_flg, dns.dptr<DType>(), num_rows, row_length);
} else {
num_threads = num_rows * threads_per_block;
- Kernel<MarkRspRowIdxBlockKernel, gpu>::Launch(s, num_threads,
+ Kernel<MarkRspRowBlockKernel, gpu>::Launch(s, num_threads,
row_flg, dns.dptr<DType>(), num_rows, row_length);
}
break;
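
The default branch encodes a simple size heuristic; summarized here for readability (comment added, not in the diff):

  // row_length < threads_per_warp (32)                          -> thread kernel
  // row_length < threads_per_block || num_rows > min_num_warps  -> warp kernel
  // otherwise (few rows, each very long)                        -> block kernel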
@@ -306,11 +164,11 @@ inline void CastStorageDnsRspImpl(const OpContext& ctx,
mshadow::Stream<gpu>::GetStream(s));

// Get total number of non-zero rows from device
- RType nnr = 0;
- CUDA_CALL(cudaMemcpy(&nnr, &row_flg[num_rows-1], sizeof(RType), cudaMemcpyDeviceToHost));
+ dim_t nnr = 0;
+ CUDA_CALL(cudaMemcpy(&nnr, &row_flg[num_rows-1], sizeof(dim_t), cudaMemcpyDeviceToHost));
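  // (Comment added for clarity.) The last element of the inclusive prefix
  // sum equals the total of all row flags, i.e. the number of non-zero rows.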

// Allocate rsp tensor row index array and fill
- rsp->CheckAndAllocAuxData(rowsparse::kIdx, Shape1(static_cast<dim_t>(nnr)));
+ rsp->CheckAndAllocAuxData(rowsparse::kIdx, Shape1(nnr));
if (0 == nnr) return;
RType* row_idx = rsp->aux_data(rowsparse::kIdx).dptr<RType>();
num_threads = num_rows;
@@ -322,7 +180,7 @@ inline void CastStorageDnsRspImpl(const OpContext& ctx,
storage_shape[0] = nnr;
rsp->CheckAndAllocData(storage_shape);
num_threads = nnr * row_length;
- Kernel<FillRspValsKernel, gpu>::Launch(s, num_threads,
+ Kernel<CastDnsRspValsKernel, gpu>::Launch(s, num_threads,
rsp->data().dptr<DType>(), row_idx, dns.dptr<DType>(), nnr, row_length);
});
});
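
For readers new to the row-sparse format, a minimal single-threaded C++ reference of the same dense-to-row-sparse cast may help; DenseToRowSparseRef is a hypothetical standalone helper written for this page, not MXNet code:

#include <cstdint>
#include <vector>

// Reference semantics of the cast above: gather each non-zero row of a
// row-major dense matrix into (row_idx, vals) -- the row_sparse layout, where
// row_idx mirrors aux_data(rowsparse::kIdx) and vals mirrors data().
void DenseToRowSparseRef(const float* dns, int64_t num_rows, int64_t row_length,
                         std::vector<int64_t>* row_idx, std::vector<float>* vals) {
  for (int64_t i = 0; i < num_rows; ++i) {
    const float* row = dns + i * row_length;
    bool nonzero = false;
    for (int64_t j = 0; j < row_length; ++j) {
      if (row[j] != 0.0f) { nonzero = true; break; }
    }
    if (nonzero) {
      row_idx->push_back(i);                             // one index per kept row
      vals->insert(vals->end(), row, row + row_length);  // the full row is stored
    }
  }
  // nnr == row_idx->size(); the value tensor has shape (nnr, row_length).
}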
@@ -332,7 +190,7 @@ inline void CastStorageDnsRspImpl(const OpContext& ctx,
* \brief Thread kernel for initializing the indptr in a csr matrix.
* Parallelized by matrix rows: 1 thread/row
*/
- struct FillCsrIndPtrThreadKernel {
+ struct CastDnsCsrIndPtrThreadKernel {
/*!
* \brief
* \param tid global thread id
@@ -368,7 +226,7 @@ struct FillCsrIndPtrThreadKernel {
* \brief Thread kernel for initializing the col_idx and value array of the csr matrix.
* Parallelized by matrix rows: 1 thread/row
*/
- struct FillCsrColIdxAndValsThreadKernel {
+ struct CastDnsCsrColIdxAndValsThreadKernel {
/*!
* \brief
* \param tid global thread id
@@ -406,7 +264,7 @@ struct FillCsrColIdxAndValsThreadKernel {
* \brief Warp kernel for initializing the indptr in a csr matrix.
* Parallelized by matrix rows: 1 warp/row
*/
- struct FillCsrIndPtrWarpKernel {
+ struct CastDnsCsrIndPtrWarpKernel {
template<typename DType, typename IType>
__device__ __forceinline__ static void Map(int tid,
IType* indptr,
@@ -444,7 +302,7 @@ struct FillCsrIndPtrWarpKernel {
* \brief Warp kernel for initializing the col_idx and value array of the csr matrix.
* Parallelized by matrix rows: 1 warp/row
*/
- struct FillCsrColIdxAndValsWarpKernel {
+ struct CastDnsCsrColIdxAndValsWarpKernel {
template<typename DType, typename IType, typename CType>
__device__ __forceinline__ static void Map(int tid,
DType* val,
@@ -498,7 +356,7 @@ struct FillCsrColIdxAndValsWarpKernel {
* \brief Block kernel for initializing the indptr in a csr matrix.
* Parallelized by matrix rows: 1 threadBlock/row
*/
- struct FillCsrIndPtrBlockKernel {
+ struct CastDnsCsrIndPtrBlockKernel {
template<typename DType, typename IType>
__device__ __forceinline__ static void Map(int tid,
IType* indptr,
@@ -533,7 +391,7 @@ struct FillCsrIndPtrBlockKernel {
* \brief Block kernel for initializing the col_idx and value array of the csr matrix.
* Parallelized by matrix rows: 1 threadBlock/row
*/
- struct FillCsrColIdxAndValsBlockKernel {
+ struct CastDnsCsrColIdxAndValsBlockKernel {
template<typename DType, typename IType, typename CType>
__device__ __forceinline__ static void Map(int tid,
DType* val,
@@ -620,31 +478,31 @@ inline void CastStorageDnsCsrImpl(const OpContext& ctx,
switch (kernel_version) {
case 1:
num_threads = num_rows;
- Kernel<FillCsrIndPtrThreadKernel, gpu>::Launch(s, num_threads,
+ Kernel<CastDnsCsrIndPtrThreadKernel, gpu>::Launch(s, num_threads,
indptr, dns_data, num_rows, num_cols);
break;
case 2:
num_threads = num_rows * threads_per_warp;
- Kernel<FillCsrIndPtrWarpKernel, gpu>::Launch(s, num_threads,
+ Kernel<CastDnsCsrIndPtrWarpKernel, gpu>::Launch(s, num_threads,
indptr, dns_data, num_rows, num_cols);
break;
case 3:
num_threads = num_rows * threads_per_block;
- Kernel<FillCsrIndPtrBlockKernel, gpu>::Launch(s, num_threads,
+ Kernel<CastDnsCsrIndPtrBlockKernel, gpu>::Launch(s, num_threads,
indptr, dns_data, num_rows, num_cols);
break;
default:
if (num_cols < threads_per_warp) {
num_threads = num_rows;
- Kernel<FillCsrIndPtrThreadKernel, gpu>::Launch(s, num_threads,
+ Kernel<CastDnsCsrIndPtrThreadKernel, gpu>::Launch(s, num_threads,
indptr, dns_data, num_rows, num_cols);
} else if (num_cols < threads_per_block || num_rows > min_num_warps) {
num_threads = num_rows * threads_per_warp;
- Kernel<FillCsrIndPtrWarpKernel, gpu>::Launch(s, num_threads,
+ Kernel<CastDnsCsrIndPtrWarpKernel, gpu>::Launch(s, num_threads,
indptr, dns_data, num_rows, num_cols);
} else {
num_threads = num_rows * threads_per_block;
- Kernel<FillCsrIndPtrBlockKernel, gpu>::Launch(s, num_threads,
+ Kernel<CastDnsCsrIndPtrBlockKernel, gpu>::Launch(s, num_threads,
indptr, dns_data, num_rows, num_cols);
}
break;
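
The collapsed lines that follow turn the per-row counts written above into CSR row pointers (in the standard construction, via a prefix sum over indptr); an illustration with made-up numbers, not from the diff:

  // per-row non-zero counts : 2 0 3 1
  // indptr after prefix sum : 0 2 2 5 6
  // row r's entries then occupy col_idx[indptr[r] .. indptr[r+1]) and
  // val[indptr[r] .. indptr[r+1]), filled by the ColIdxAndVals kernels below.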
@@ -685,36 +543,36 @@ inline void CastStorageDnsCsrImpl(const OpContext& ctx,
switch (kernel_version) {
case 1:
num_threads = num_rows;
- Kernel<FillCsrColIdxAndValsThreadKernel, gpu>::Launch(s, num_threads,
+ Kernel<CastDnsCsrColIdxAndValsThreadKernel, gpu>::Launch(s, num_threads,
csr->data().dptr<DType>(), csr->aux_data(csr::kIdx).dptr<CType>(),
indptr, dns_data, num_rows, num_cols);
break;
case 2:
num_threads = num_rows * threads_per_warp;
- Kernel<FillCsrColIdxAndValsWarpKernel, gpu>::Launch(s, num_threads,
+ Kernel<CastDnsCsrColIdxAndValsWarpKernel, gpu>::Launch(s, num_threads,
csr->data().dptr<DType>(), csr->aux_data(csr::kIdx).dptr<CType>(),
indptr, dns_data, num_rows, num_cols);
break;
case 3:
num_threads = num_rows * threads_per_block;
- Kernel<FillCsrColIdxAndValsBlockKernel, gpu>::Launch(s, num_threads,
+ Kernel<CastDnsCsrColIdxAndValsBlockKernel, gpu>::Launch(s, num_threads,
csr->data().dptr<DType>(), csr->aux_data(csr::kIdx).dptr<CType>(),
indptr, dns_data, num_rows, num_cols);
break;
default:
if (num_cols < threads_per_warp) {
num_threads = num_rows;
- Kernel<FillCsrColIdxAndValsThreadKernel, gpu>::Launch(s, num_threads,
+ Kernel<CastDnsCsrColIdxAndValsThreadKernel, gpu>::Launch(s, num_threads,
csr->data().dptr<DType>(), csr->aux_data(csr::kIdx).dptr<CType>(),
indptr, dns_data, num_rows, num_cols);
} else if (num_cols < threads_per_block || num_rows > min_num_warps) {
num_threads = num_rows * threads_per_warp;
- Kernel<FillCsrColIdxAndValsWarpKernel, gpu>::Launch(s, num_threads,
+ Kernel<CastDnsCsrColIdxAndValsWarpKernel, gpu>::Launch(s, num_threads,
csr->data().dptr<DType>(), csr->aux_data(csr::kIdx).dptr<CType>(),
indptr, dns_data, num_rows, num_cols);
} else {
num_threads = num_rows * threads_per_block;
- Kernel<FillCsrColIdxAndValsBlockKernel, gpu>::Launch(s, num_threads,
+ Kernel<CastDnsCsrColIdxAndValsBlockKernel, gpu>::Launch(s, num_threads,
csr->data().dptr<DType>(), csr->aux_data(csr::kIdx).dptr<CType>(),
indptr, dns_data, num_rows, num_cols);
}
(Diffs for the remaining four changed files are not shown.)
