
Commit 392f3a9

[API] fix paddle.cross with big tensor
1 parent f07ca44 commit 392f3a9

5 files changed (+140 −86 lines)
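The fix is the same across all five files: loop bounds and element offsets for paddle.cross were computed in 32-bit int, which wraps once a slice product or the element count of a big tensor exceeds INT32_MAX. Here is a minimal sketch of the failure mode, assuming a hypothetical shape (3, 2048, 1024, 1024) that is not taken from the commit:

```cpp
#include <cstdint>
#include <iostream>
#include <limits>

int main() {
  // Hypothetical "big tensor" shape for paddle.cross along axis 0:
  // (3, 2048, 1024, 1024). slice_size = 2048 * 1024 * 1024 = 2^31,
  // which is one past INT32_MAX.
  const int64_t dims[] = {3, 2048, 1024, 1024};

  int64_t slice_size = 1;
  for (int i = 1; i < 4; i++) slice_size *= dims[i];

  std::cout << "slice_size           = " << slice_size << "\n";  // 2147483648
  std::cout << "INT32_MAX            = "
            << std::numeric_limits<int32_t>::max() << "\n";      // 2147483647
  // Truncating to 32 bits, as the old static_cast<int> accumulation
  // effectively did, corrupts the index:
  std::cout << "truncated to int32_t = "
            << static_cast<int32_t>(slice_size) << "\n";         // -2147483648
}
```

The CPU kernels fix this by widening the index accumulators to int64_t; the GPU kernels additionally template the index type so that small tensors keep the cheaper 32-bit indexing.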

paddle/phi/kernels/cpu/cross_grad_kernel.cc

Lines changed: 12 additions & 12 deletions
```diff
@@ -74,13 +74,13 @@ void CrossGradKernel(const Context &dev_ctx,
                         "But received: Input(X/Y).dims() == [%s].",
                         input_x_dims));
   }
-  auto outer_loops = 1;
-  for (auto i = 0; i < dim; i++) {
-    outer_loops *= static_cast<int>(input_x_dims[i]);
+  int64_t outer_loops = 1;
+  for (int i = 0; i < dim; i++) {
+    outer_loops *= input_x_dims[i];
   }
-  auto slice_size = 1;
-  for (auto i = dim + 1; i < input_x_dims.size(); i++) {
-    slice_size *= static_cast<int>(input_x_dims[i]);
+  int64_t slice_size = 1;
+  for (int i = dim + 1; i < input_x_dims.size(); i++) {
+    slice_size *= input_x_dims[i];
   }

   int64_t numel = x.numel();
@@ -111,12 +111,12 @@ void CrossGradKernel(const Context &dev_ctx,
   dev_ctx.template Alloc<T>(output_x_grad);
   dev_ctx.template Alloc<T>(output_y_grad);

-  for (auto i = 0; i < outer_loops; i++) {
-    for (auto j = 0; j < 3; j++) {
-      auto dst_pos = (3 * i + j) * slice_size;
-      auto in_pos1 = (3 * i + ((j + 1) % 3)) * slice_size;
-      auto in_pos2 = (3 * i + ((j + 2) % 3)) * slice_size;
-      for (auto k = 0; k < slice_size; k++) {
+  for (int64_t i = 0; i < outer_loops; i++) {
+    for (int64_t j = 0; j < 3; j++) {
+      int64_t dst_pos = (3 * i + j) * slice_size;
+      int64_t in_pos1 = (3 * i + ((j + 1) % 3)) * slice_size;
+      int64_t in_pos2 = (3 * i + ((j + 2) % 3)) * slice_size;
+      for (int64_t k = 0; k < slice_size; k++) {
         out_dx_vec[dst_pos + k] =
             input_dout_vec[in_pos2 + k] * input_y_vec[in_pos1 + k] -
             input_dout_vec[in_pos1 + k] * input_y_vec[in_pos2 + k];
```
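The loop above addresses elements at (3 * i + j) * slice_size + k, so the widened int64_t locals are what keep dst_pos, in_pos1, and in_pos2 from wrapping. A standalone sketch of the same skeleton, simplified from the kernel, with plain vectors standing in for tensor buffers:

```cpp
#include <cstdint>
#include <vector>

// Simplified sketch of the CrossGradKernel CPU loop above (dx = dout x y),
// with every index held in int64_t so the products cannot wrap.
template <typename T>
void CrossGradCpuSketch(const std::vector<T>& dout, const std::vector<T>& y,
                        std::vector<T>& dx, int64_t outer_loops,
                        int64_t slice_size) {
  for (int64_t i = 0; i < outer_loops; i++) {
    for (int64_t j = 0; j < 3; j++) {
      // 64-bit products stay valid even when
      // outer_loops * 3 * slice_size exceeds INT32_MAX.
      int64_t dst_pos = (3 * i + j) * slice_size;
      int64_t in_pos1 = (3 * i + ((j + 1) % 3)) * slice_size;
      int64_t in_pos2 = (3 * i + ((j + 2) % 3)) * slice_size;
      for (int64_t k = 0; k < slice_size; k++) {
        dx[dst_pos + k] = dout[in_pos2 + k] * y[in_pos1 + k] -
                          dout[in_pos1 + k] * y[in_pos2 + k];
      }
    }
  }
}
```

The forward kernel in cross_kernel.cc below receives the identical widening.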

paddle/phi/kernels/cpu/cross_kernel.cc

Lines changed: 10 additions & 10 deletions
```diff
@@ -75,13 +75,13 @@ void CrossKernel(const Context& dev_ctx,
     dev_ctx.template Alloc<T>(output);
     return;
   }
-  auto outer_loops = 1;
+  int64_t outer_loops = 1;
   for (auto i = 0; i < dim; i++) {
-    outer_loops *= static_cast<int>(input_x_dims[i]);
+    outer_loops *= input_x_dims[i];
   }
-  auto slice_size = 1;
+  int64_t slice_size = 1;
   for (auto i = dim + 1; i < input_x_dims.size(); i++) {
-    slice_size *= static_cast<int>(input_x_dims[i]);
+    slice_size *= input_x_dims[i];
   }

   std::vector<T> input_x_vec, input_y_vec;
@@ -91,13 +91,13 @@ void CrossKernel(const Context& dev_ctx,

   dev_ctx.template Alloc<T>(output);

-  for (auto i = 0; i < outer_loops; i++) {
-    for (auto j = 0; j < 3; j++) {
-      auto dst_pos = (3 * i + j) * slice_size;
-      auto in_pos1 = (3 * i + ((j + 1) % 3)) * slice_size;
-      auto in_pos2 = (3 * i + ((j + 2) % 3)) * slice_size;
+  for (int64_t i = 0; i < outer_loops; i++) {
+    for (int64_t j = 0; j < 3; j++) {
+      int64_t dst_pos = (3 * i + j) * slice_size;
+      int64_t in_pos1 = (3 * i + ((j + 1) % 3)) * slice_size;
+      int64_t in_pos2 = (3 * i + ((j + 2) % 3)) * slice_size;

-      for (auto k = 0; k < slice_size; k++) {
+      for (int64_t k = 0; k < slice_size; k++) {
         out_vec[dst_pos + k] =
             input_x_vec[in_pos1 + k] * input_y_vec[in_pos2 + k] -
             input_x_vec[in_pos2 + k] * input_y_vec[in_pos1 + k];
```

paddle/phi/kernels/gpu/class_center_sample_kernel.cu

Lines changed: 6 additions & 4 deletions
```diff
@@ -41,12 +41,14 @@ namespace cub = hipcub;
 #include "paddle/phi/core/kernel_registry.h"

 namespace phi {
-#define CUDA_KERNEL_LOOP(i, n)                               \
-  for (int32_t i = blockIdx.x * blockDim.x + threadIdx.x,    \
-               step = blockDim.x * gridDim.x;                \
-       i < (n);                                              \
+#define CUDA_KERNEL_LOOP_TYPE(i, n, index_type)              \
+  for (index_type i = blockIdx.x * blockDim.x + threadIdx.x, \
+                  step = blockDim.x * gridDim.x;             \
+       i < (n);                                              \
        i += step)

+#define CUDA_KERNEL_LOOP(i, n) CUDA_KERNEL_LOOP_TYPE(i, n, int32_t)
+
 static constexpr int kNumCUDAThreads = 512;
 static constexpr int kNumMaximumNumBlocks = 4096;
```

paddle/phi/kernels/gpu/cross_grad_kernel.cu

Lines changed: 77 additions & 40 deletions
```diff
@@ -24,21 +24,22 @@

 namespace phi {

-template <typename T>
-__global__ void CrossGrad(const T* x,
-                          const T* y,
-                          const T* out,
-                          T* out_dx,
-                          T* out_dy,
-                          const int64_t stride,
-                          const int64_t N,
-                          phi::funcs::IndexCalculator<int> index_calculator) {
-  CUDA_KERNEL_LOOP(i, N) {
-    int64_t offset = index_calculator(i);
-
-    auto pos0 = offset + 0 * stride;
-    auto pos1 = offset + 1 * stride;
-    auto pos2 = offset + 2 * stride;
+template <typename T, typename IndexType>
+__global__ void CrossGrad(
+    const T* x,
+    const T* y,
+    const T* out,
+    T* out_dx,
+    T* out_dy,
+    const IndexType stride,
+    const IndexType N,
+    phi::funcs::IndexCalculator<IndexType> index_calculator) {
+  CUDA_KERNEL_LOOP_TYPE(i, N, IndexType) {
+    IndexType offset = index_calculator(i);
+
+    IndexType pos0 = offset + 0 * stride;
+    IndexType pos1 = offset + 1 * stride;
+    IndexType pos2 = offset + 2 * stride;

     using MPType = typename phi::dtype::MPTypeTrait<T>::Type;

@@ -168,11 +169,10 @@ void CrossGradKernel(const Context& dev_ctx,
   const auto* input_out_grad_data = input_out_grad.data<T>();
   auto* output_x_grad_data = dev_ctx.template Alloc<T>(x_grad);
   auto* output_y_grad_data = dev_ctx.template Alloc<T>(y_grad);
-  auto index_calculator = phi::funcs::IndexCalculator<int>(
-      merged_dims.size() - 1, cal_dims, left_strides, full_strides);

   backends::gpu::GpuLaunchConfig config =
       backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel / 3);
+  constexpr int64_t int_max = std::numeric_limits<int>::max();
   if (IsComplexType(x.dtype())) {
     DenseTensor x_conj, y_conj;
     DenseTensorMeta meta_xy(x.dtype(), x.dims());
@@ -189,30 +189,67 @@ void CrossGradKernel(const Context& dev_ctx,
         input_y_data, numel, input_y_conj_data);
     for_range(functor_x);
     for_range(functor_y);
-
-    CrossGrad<<<config.block_per_grid,
-                config.thread_per_block,
-                0,
-                dev_ctx.stream()>>>(input_x_conj_data,
-                                    input_y_conj_data,
-                                    input_out_grad_data,
-                                    output_x_grad_data,
-                                    output_y_grad_data,
-                                    full_strides[merge_axis],
-                                    numel / 3,
-                                    index_calculator);
+    if (full_strides[merge_axis] * 2 > int_max || numel / 3 > int_max) {
+      auto index_calculator = phi::funcs::IndexCalculator<int64_t>(
+          merged_dims.size() - 1, cal_dims, left_strides, full_strides);
+      CrossGrad<<<config.block_per_grid,
+                  config.thread_per_block,
+                  0,
+                  dev_ctx.stream()>>>(input_x_conj_data,
+                                      input_y_conj_data,
+                                      input_out_grad_data,
+                                      output_x_grad_data,
+                                      output_y_grad_data,
+                                      full_strides[merge_axis],
+                                      numel / 3,
+                                      index_calculator);
+    } else {
+      auto index_calculator = phi::funcs::IndexCalculator<int32_t>(
+          merged_dims.size() - 1, cal_dims, left_strides, full_strides);
+      CrossGrad<<<config.block_per_grid,
+                  config.thread_per_block,
+                  0,
+                  dev_ctx.stream()>>>(
+          input_x_conj_data,
+          input_y_conj_data,
+          input_out_grad_data,
+          output_x_grad_data,
+          output_y_grad_data,
+          static_cast<int32_t>(full_strides[merge_axis]),
+          static_cast<int32_t>(numel / 3),
+          index_calculator);
+    }
   } else {
-    CrossGrad<<<config.block_per_grid,
-                config.thread_per_block,
-                0,
-                dev_ctx.stream()>>>(input_x_data,
-                                    input_y_data,
-                                    input_out_grad_data,
-                                    output_x_grad_data,
-                                    output_y_grad_data,
-                                    full_strides[merge_axis],
-                                    numel / 3,
-                                    index_calculator);
+    if (full_strides[merge_axis] * 2 > int_max || numel / 3 > int_max) {
+      auto index_calculator = phi::funcs::IndexCalculator<int64_t>(
+          merged_dims.size() - 1, cal_dims, left_strides, full_strides);
+      CrossGrad<<<config.block_per_grid,
+                  config.thread_per_block,
+                  0,
+                  dev_ctx.stream()>>>(input_x_data,
+                                      input_y_data,
+                                      input_out_grad_data,
+                                      output_x_grad_data,
+                                      output_y_grad_data,
+                                      full_strides[merge_axis],
+                                      numel / 3,
+                                      index_calculator);
+    } else {
+      auto index_calculator = phi::funcs::IndexCalculator<int32_t>(
+          merged_dims.size() - 1, cal_dims, left_strides, full_strides);
+      CrossGrad<<<config.block_per_grid,
+                  config.thread_per_block,
+                  0,
+                  dev_ctx.stream()>>>(
+          input_x_data,
+          input_y_data,
+          input_out_grad_data,
+          output_x_grad_data,
+          output_y_grad_data,
+          static_cast<int32_t>(full_strides[merge_axis]),
+          static_cast<int32_t>(numel / 3),
+          index_calculator);
+    }
   }
 }
 }  // namespace phi
```
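Both GPU launch sites now choose the index width at runtime, and the guard mirrors what the kernel actually computes: the largest offset touched is offset + 2 * stride and the grid-stride loop runs to N = numel / 3, so both must fit the chosen type. A condensed sketch of the dispatch pattern, assuming a hypothetical CrossToy kernel over a contiguous (3, n) layout with the IndexCalculator elided:

```cuda
#include <cstdint>
#include <limits>

// Toy kernel: one cross product per thread, with the three components
// stride apart in memory (stride = n for a contiguous (3, n) layout).
// The real kernels also map i through a phi::funcs::IndexCalculator.
template <typename T, typename IndexType>
__global__ void CrossToy(const T* x, const T* y, T* out, IndexType stride,
                         IndexType n) {
  for (IndexType i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += blockDim.x * gridDim.x) {
    IndexType pos0 = i + 0 * stride;
    IndexType pos1 = i + 1 * stride;
    IndexType pos2 = i + 2 * stride;  // largest index touched: i + 2 * stride
    out[pos0] = x[pos1] * y[pos2] - x[pos2] * y[pos1];
    out[pos1] = x[pos2] * y[pos0] - x[pos0] * y[pos2];
    out[pos2] = x[pos0] * y[pos1] - x[pos1] * y[pos0];
  }
}

// Launch-time dispatch with the same guard as the diff.
template <typename T>
void LaunchCrossToy(const T* x, const T* y, T* out, int64_t stride, int64_t n,
                    dim3 grid, dim3 block, cudaStream_t stream) {
  constexpr int64_t int_max = std::numeric_limits<int32_t>::max();
  if (stride * 2 > int_max || n > int_max) {
    CrossToy<T, int64_t><<<grid, block, 0, stream>>>(x, y, out, stride, n);
  } else {
    // 32-bit indices keep address arithmetic cheaper on the common path.
    CrossToy<T, int32_t><<<grid, block, 0, stream>>>(
        x, y, out, static_cast<int32_t>(stride), static_cast<int32_t>(n));
  }
}
```

Under this guard the 64-bit path only engages once numel reaches 3 × (INT32_MAX + 1) = 6,442,450,944 elements or the component stride reaches 2^30; everything smaller stays on the 32-bit fast path.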

paddle/phi/kernels/gpu/cross_kernel.cu

Lines changed: 35 additions & 20 deletions
```diff
@@ -23,19 +23,19 @@

 namespace phi {

-template <typename T>
+template <typename T, typename IndexType>
 __global__ void Cross(const T* x,
                       const T* y,
                       T* out,
-                      const int64_t stride,
-                      const int64_t N,
-                      phi::funcs::IndexCalculator<int> index_calculator) {
-  CUDA_KERNEL_LOOP(i, N) {
-    int64_t offset = index_calculator(i);
+                      const IndexType stride,
+                      const IndexType N,
+                      phi::funcs::IndexCalculator<IndexType> index_calculator) {
+  CUDA_KERNEL_LOOP_TYPE(i, N, IndexType) {
+    IndexType offset = index_calculator(i);

-    auto pos0 = offset + 0 * stride;
-    auto pos1 = offset + 1 * stride;
-    auto pos2 = offset + 2 * stride;
+    IndexType pos0 = offset + 0 * stride;
+    IndexType pos1 = offset + 1 * stride;
+    IndexType pos2 = offset + 2 * stride;

     using MPType = typename phi::dtype::MPTypeTrait<T>::Type;

@@ -149,22 +149,37 @@ void CrossKernel(const Context& dev_ctx,
   const auto* input_x_data = input_x.data<T>();
   const auto* input_y_data = input_y.data<T>();
   auto* out_data = dev_ctx.template Alloc<T>(out);
-  auto index_calculator = phi::funcs::IndexCalculator<int>(
-      merged_dims.size() - 1, cal_dims, left_strides, full_strides);

   int64_t numel = x.numel();
   backends::gpu::GpuLaunchConfig config =
       backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel / 3);

-  Cross<<<config.block_per_grid,
-          config.thread_per_block,
-          0,
-          dev_ctx.stream()>>>(input_x_data,
-                              input_y_data,
-                              out_data,
-                              full_strides[merge_axis],
-                              numel / 3,
-                              index_calculator);
+  constexpr int64_t int_max = std::numeric_limits<int>::max();
+  if (full_strides[merge_axis] * 2 > int_max || numel / 3 > int_max) {
+    auto index_calculator = phi::funcs::IndexCalculator<int64_t>(
+        merged_dims.size() - 1, cal_dims, left_strides, full_strides);
+    Cross<<<config.block_per_grid,
+            config.thread_per_block,
+            0,
+            dev_ctx.stream()>>>(input_x_data,
+                                input_y_data,
+                                out_data,
+                                full_strides[merge_axis],
+                                numel / 3,
+                                index_calculator);
+  } else {
+    auto index_calculator = phi::funcs::IndexCalculator<int32_t>(
+        merged_dims.size() - 1, cal_dims, left_strides, full_strides);
+    Cross<<<config.block_per_grid,
+            config.thread_per_block,
+            0,
+            dev_ctx.stream()>>>(input_x_data,
+                                input_y_data,
+                                out_data,
+                                static_cast<int32_t>(full_strides[merge_axis]),
+                                static_cast<int32_t>(numel / 3),
+                                index_calculator);
+  }
 }
 }  // namespace phi
```
