Fix BlockPrefixCallbackOp: add a UseKahan template flag and enable Kahan compensation only when scan_size > 1 << 16

cangtianhuang · cangtianhuang · commit 05a1cccb9b99 · 2025-08-06T15:44:04.000+08:00
diff --git a/paddle/phi/kernels/gpu/cum_kernel.cu b/paddle/phi/kernels/gpu/cum_kernel.cu
@@ -128,18 +128,36 @@ struct Identity<T, ComplexSum> {
   static constexpr T value = {0, 0};
 };
 
+template <typename T, typename Op, bool UseKahan>
+struct BlockPrefixCallbackOp;
+
 template <typename T, typename Op>
-struct BlockPrefixCallbackOp {
+struct BlockPrefixCallbackOp<T, Op, false> {
   // Running prefix
   T running_total_;
-  T compensation_;
   Op op_;
 
   __device__ BlockPrefixCallbackOp(T identity, Op op)
-      : running_total_(identity), compensation_(identity), op_(op) {}
+      : running_total_(identity), op_(op) {}
 
   // Callback operator to be entered by the first warp of threads in the block.
   // tid 0 is responsible for returning a value for seeding the block-wide scan.
+  __device__ T operator()(T block_aggregate) {
+    const T old_prefix = running_total_;
+    running_total_ = op_(running_total_, block_aggregate);
+    return old_prefix;
+  }
+};
+
+template <typename T, typename Op>
+struct BlockPrefixCallbackOp<T, Op, true> {
+  T running_total_;
+  T compensation_;
+  Op op_;
+
+  __device__ BlockPrefixCallbackOp(T identity, Op op)
+      : running_total_(identity), compensation_(static_cast<T>(0.0)), op_(op) {}
+
   __device__ T operator()(T block_aggregate) {
     T old_prefix = running_total_;
 
@@ -155,20 +173,23 @@ struct BlockPrefixCallbackOp {
 };
 
 template <typename T>
-struct BlockPrefixCallbackOp<T, LogAddExp> {
+struct BlockPrefixCallbackOp<T, LogAddExp, true> {
   T max_so_far_;
   T scaled_sum_;
   T compensation_;
   LogAddExp op_;
 
   __device__ BlockPrefixCallbackOp(T identity, LogAddExp op)
-      : max_so_far_(identity), scaled_sum_(0.0), compensation_(0.0), op_(op) {}
+      : max_so_far_(identity),
+        scaled_sum_(static_cast<T>(0.0)),
+        compensation_(static_cast<T>(0.0)),
+        op_(op) {}
 
   __device__ T operator()(T block_aggregate) {
     if (scaled_sum_ == 0.0) {
       max_so_far_ = block_aggregate;
-      scaled_sum_ = 1.0;
-      compensation_ = 0.0;
+      scaled_sum_ = static_cast<T>(1.0);
+      compensation_ = static_cast<T>(0.0);
       return std::numeric_limits<T>::lowest();
     }
 
@@ -195,15 +216,19 @@ struct BlockPrefixCallbackOp<T, LogAddExp> {
   }
 };
 
-template <typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD, typename Op>
+template <typename T,
+          int BLOCK_THREADS,
+          int ITEMS_PER_THREAD,
+          typename Op,
+          bool UseKahan>
 __global__ void BlockScanKernel(T* d_out,
                                 const T* d_in,
                                 int64_t grid_size,
                                 int64_t scan_size,
                                 bool exclusive,
                                 Op op) {
   using MT = typename phi::dtype::MPTypeTrait<T>::Type;
-  using CallbackOp = BlockPrefixCallbackOp<MT, Op>;
+  using CallbackOp = BlockPrefixCallbackOp<MT, Op, UseKahan>;
 
   // Specialize BlockLoad, BlockStore, and BlockRadixSort collective types
   using BlockLoadT = cub::
@@ -350,14 +375,30 @@ void ScanKernel(const Context& dev_ctx,
     }
   }
 
+  // Scans longer than this threshold use the Kahan variant for better precision
+  constexpr int64_t KAHAN_SWITCH_LENGTH = 1 << 16;
+
   // Do scan
   if (!transpose && !reverse) {
-    BlockScanKernel<T, 128, 4, Op><<<scan_grid, 128, 0, dev_ctx.stream()>>>(
-        out_data, in_data, grid_size, scan_size, exclusive, op);
-
+    if (scan_size > KAHAN_SWITCH_LENGTH) {
+      BlockScanKernel<T, 128, 4, Op, true>
+          <<<scan_grid, 128, 0, dev_ctx.stream()>>>(
+              out_data, in_data, grid_size, scan_size, exclusive, op);
+    } else {
+      BlockScanKernel<T, 128, 4, Op, false>
+          <<<scan_grid, 128, 0, dev_ctx.stream()>>>(
+              out_data, in_data, grid_size, scan_size, exclusive, op);
+    }
   } else {
-    BlockScanKernel<T, 128, 4, Op><<<scan_grid, 128, 0, dev_ctx.stream()>>>(
-        next_out_data, next_in_data, grid_size, scan_size, exclusive, op);
+    if (scan_size > KAHAN_SWITCH_LENGTH) {
+      BlockScanKernel<T, 128, 4, Op, true>
+          <<<scan_grid, 128, 0, dev_ctx.stream()>>>(
+              next_out_data, next_in_data, grid_size, scan_size, exclusive, op);
+    } else {
+      BlockScanKernel<T, 128, 4, Op, false>
+          <<<scan_grid, 128, 0, dev_ctx.stream()>>>(
+              next_out_data, next_in_data, grid_size, scan_size, exclusive, op);
+    }
   }
   swap_ptr(next_in_data, next_out_data);