@@ -148,43 +148,38 @@ struct Identity<T, ComplexSum> {
 template <typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD, typename Op>
 __global__ void BlockScanKernel(T* d_out,
                                 const T* d_in,
+                                T* d_agg,
                                 int64_t grid_size,
                                 int64_t scan_size,
                                 bool exclusive,
                                 Op op) {
   using MT = typename phi::dtype::MPTypeTrait<T>::Type;
 
-  // Specialize BlockLoad, BlockStore, and BlockRadixSort collective types
-  typedef cub::
-      BlockLoad<MT, BLOCK_THREADS, ITEMS_PER_THREAD, cub::BLOCK_LOAD_TRANSPOSE>
-          BlockLoadT;
-  typedef cub::BlockStore<MT,
-                          BLOCK_THREADS,
-                          ITEMS_PER_THREAD,
-                          cub::BLOCK_STORE_TRANSPOSE>
-      BlockStoreT;
-  typedef cub::BlockScan<MT, BLOCK_THREADS> BlockScanT;
+  // Specialize BlockLoad, BlockStore, BlockScan, and BlockReduce collective types
+  using BlockLoadT = cub::BlockLoad<MT, BLOCK_THREADS, ITEMS_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE>;
+  using BlockStoreT = cub::BlockStore<MT, BLOCK_THREADS, ITEMS_PER_THREAD, cub::BLOCK_STORE_WARP_TRANSPOSE>;
+  using BlockScanT = cub::BlockScan<MT, BLOCK_THREADS, cub::BLOCK_SCAN_WARP_SCANS>;
+  using BlockReduceT = cub::BlockReduce<MT, BLOCK_THREADS>;
+
   // Allocate type-safe, repurposable shared memory for collectives
   __shared__ union {
     typename BlockLoadT::TempStorage load;
     typename BlockStoreT::TempStorage store;
     typename BlockScanT::TempStorage scan;
+    typename BlockReduceT::TempStorage reduce;
   } temp_storage;
 
   // Obtain this block's segment of consecutive keys (blocked across threads)
   int64_t item_per_block = BLOCK_THREADS * ITEMS_PER_THREAD;
 
   for (int64_t bx = blockIdx.x; bx < grid_size; bx += gridDim.x) {
+    int64_t row_offset = bx * scan_size;
     BlockPrefixCallbackOp<MT, Op> prefix_op(Identity<MT, Op>::value, op);
 
-    for (int64_t block_offset = 0; block_offset < scan_size;
-         block_offset += item_per_block) {
+    for (int64_t block_offset = 0; block_offset < scan_size; block_offset += item_per_block) {
       int64_t valid_item = (scan_size - block_offset > item_per_block)
                                ? item_per_block
                                : (scan_size - block_offset);
-      if (scan_size < item_per_block) {
-        valid_item = scan_size;
-      }
 
-      int64_t offset = bx * scan_size + block_offset;
+      int64_t offset = row_offset + block_offset;
 
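The tile loop above follows CUB's standard multi-tile prefix-scan idiom: each tile of the row is loaded, scanned with the running total carried in by a prefix callback, and stored, and the callback is updated so the next tile starts from the right value. Below is a minimal self-contained sketch of that idiom, assuming float data and an inclusive sum; the names RunningPrefix and RowInclusiveSum are illustrative, not Paddle's (Paddle's callback is BlockPrefixCallbackOp<MT, Op>), and the new BlockReduceT / d_agg path from the diff is not reproduced here.

#include <cub/cub.cuh>

// Running-total functor: CUB calls it with each tile's aggregate and uses the
// returned value as that tile's prefix (illustrative stand-in for
// BlockPrefixCallbackOp<MT, Op>).
struct RunningPrefix {
  float total;
  __device__ explicit RunningPrefix(float init) : total(init) {}
  __device__ float operator()(float tile_aggregate) {
    float prefix = total;     // seed for the current tile
    total += tile_aggregate;  // carry into the next tile
    return prefix;
  }
};

// One block scans one row at a time; the grid-stride loop over rows mirrors
// the `bx` loop in the kernel above.
template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
__global__ void RowInclusiveSum(float* d_out, const float* d_in,
                                int64_t num_rows, int64_t row_len) {
  using BlockLoadT = cub::BlockLoad<float, BLOCK_THREADS, ITEMS_PER_THREAD,
                                    cub::BLOCK_LOAD_WARP_TRANSPOSE>;
  using BlockStoreT = cub::BlockStore<float, BLOCK_THREADS, ITEMS_PER_THREAD,
                                      cub::BLOCK_STORE_WARP_TRANSPOSE>;
  using BlockScanT =
      cub::BlockScan<float, BLOCK_THREADS, cub::BLOCK_SCAN_WARP_SCANS>;

  __shared__ union {
    typename BlockLoadT::TempStorage load;
    typename BlockStoreT::TempStorage store;
    typename BlockScanT::TempStorage scan;
  } temp_storage;

  constexpr int kTile = BLOCK_THREADS * ITEMS_PER_THREAD;

  for (int64_t row = blockIdx.x; row < num_rows; row += gridDim.x) {
    const int64_t row_base = row * row_len;
    RunningPrefix prefix(0.0f);  // reset the carry for every row

    for (int64_t tile = 0; tile < row_len; tile += kTile) {
      // Number of in-bounds elements in this tile.
      int valid = (row_len - tile < kTile) ? static_cast<int>(row_len - tile)
                                           : kTile;
      float items[ITEMS_PER_THREAD];

      // Load, scan with the running prefix, then store; the syncs are needed
      // because load, scan, and store alias the same shared-memory union.
      BlockLoadT(temp_storage.load)
          .Load(d_in + row_base + tile, items, valid, 0.0f);
      __syncthreads();
      BlockScanT(temp_storage.scan).InclusiveSum(items, items, prefix);
      __syncthreads();
      BlockStoreT(temp_storage.store)
          .Store(d_out + row_base + tile, items, valid);
      __syncthreads();
    }
  }
}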
@@ -376,7 +371,6 @@ void ScanKernel(const Context& dev_ctx,
   if (!transpose && !reverse) {
     BlockScanKernel<T, 128, 4, Op><<<scan_grid, 128, 0, dev_ctx.stream()>>>(
         out_data, in_data, grid_size, scan_size, exclusive, op);
-
   } else {
     BlockScanKernel<T, 128, 4, Op><<<scan_grid, 128, 0, dev_ctx.stream()>>>(
         next_out_data, next_in_data, grid_size, scan_size, exclusive, op);
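For context on the launches in the second hunk: grid_size is the number of independent rows and scan_size the length of the scanned axis in a row-major [grid_size, scan_size] buffer. A hedged host-side sketch of such a launch, reusing the RowInclusiveSum sketch above; cumsum_last_axis and the grid cap are assumptions, not Paddle's ScanKernel.

#include <algorithm>
#include <cstdint>

// Assumed wrapper, not a Paddle API: inclusive cumsum along the last axis of
// an [outer, axis_len] row-major device buffer.
void cumsum_last_axis(float* d_out, const float* d_in, int64_t outer,
                      int64_t axis_len, cudaStream_t stream) {
  constexpr int kThreads = 128;  // BLOCK_THREADS, matching the launches above
  constexpr int kItems = 4;      // ITEMS_PER_THREAD
  // Cap the grid (arbitrary cap); the kernel's grid-stride loop over rows
  // picks up any remaining rows.
  int blocks = static_cast<int>(std::min<int64_t>(outer, 65535));
  RowInclusiveSum<kThreads, kItems>
      <<<blocks, kThreads, 0, stream>>>(d_out, d_in, outer, axis_len);
}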