From 55c154360fb8527cb36044a16fb69e067141a0e7 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Mon, 5 Sep 2022 06:47:53 +0000 Subject: [PATCH 01/12] support int64 non-broadcast --- paddle/phi/kernels/funcs/broadcast_function.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/paddle/phi/kernels/funcs/broadcast_function.h b/paddle/phi/kernels/funcs/broadcast_function.h index 9b9d9e1d20e12..40e2e39f024ee 100644 --- a/paddle/phi/kernels/funcs/broadcast_function.h +++ b/paddle/phi/kernels/funcs/broadcast_function.h @@ -580,10 +580,24 @@ void BroadcastKernel(const KPDevice &ctx, Functor func) { std::vector dims_size; dims_size.reserve(ins.size()); + + bool no_broadcast_flag = true; for (auto *in : ins) { + no_broadcast_flag &= (in->dims() == ins[0]->dims()); dims_size.emplace_back(in->dims().size()); } + if (!ins.empty()) { + for (auto *out : *outs) { + no_broadcast_flag &= (out->dims() == ins[0]->dims()); + } + } + + if (no_broadcast_flag) { + phi::funcs::ElementwiseKernel(ctx, ins, outs, func); + return; + } + axis = axis == -1 ? *std::max_element(dims_size.begin(), dims_size.end()) - *std::min_element(dims_size.begin(), dims_size.end()) : axis; From b7b010005f1a4e3d97b7f046b1781e96d3709a8e Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Wed, 7 Sep 2022 01:30:01 +0000 Subject: [PATCH 02/12] support broadcast case for int64 index --- .../platform/device/gpu/cuda/cuda_helper.h | 11 +- .../platform/device/gpu/rocm/rocm_helper.h | 3 +- paddle/phi/backends/gpu/cuda/cuda_helper.h | 11 +- paddle/phi/backends/gpu/rocm/rocm_helper.h | 3 +- paddle/phi/kernels/funcs/broadcast_function.h | 286 ++++++++++++++++++ 5 files changed, 302 insertions(+), 12 deletions(-) diff --git a/paddle/fluid/platform/device/gpu/cuda/cuda_helper.h b/paddle/fluid/platform/device/gpu/cuda/cuda_helper.h index d1d33d50a5dbb..fa21a5f096611 100644 --- a/paddle/fluid/platform/device/gpu/cuda/cuda_helper.h +++ b/paddle/fluid/platform/device/gpu/cuda/cuda_helper.h @@ -70,11 +70,12 @@ namespace platform { * */ -#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \ - int64_t __index__ = \ - static_cast(blockIdx.x) * blockDim.x + threadIdx.x; \ - for (index_type i = __index__; __index__ < (num); \ - __index__ += blockDim.x * gridDim.x, i = __index__) +#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \ + int64_t __index__ = \ + static_cast(blockIdx.x) * blockDim.x + threadIdx.x; \ + int64_t __stride__ = static_cast(blockDim.x) * gridDim.x; \ + for (index_type i = __index__; __index__ < (num); \ + __index__ += __stride__, i = __index__) class CublasHandleHolder { public: diff --git a/paddle/fluid/platform/device/gpu/rocm/rocm_helper.h b/paddle/fluid/platform/device/gpu/rocm/rocm_helper.h index 8bcae15d3517e..45eba2b1537c8 100644 --- a/paddle/fluid/platform/device/gpu/rocm/rocm_helper.h +++ b/paddle/fluid/platform/device/gpu/rocm/rocm_helper.h @@ -70,8 +70,9 @@ namespace platform { #define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \ int64_t __index__ = \ static_cast(hipBlockIdx_x) * hipBlockDim_x + hipThreadIdx_x; \ + int64_t __stride__ = static_cast(hipBlockDim_x) * hipGridDim_x; \ for (index_type i = __index__; __index__ < (num); \ - __index__ += hipBlockDim_x * hipGridDim_x, i = __index__) + __index__ += __stride__, i = __index__) class CublasHandleHolder { public: diff --git a/paddle/phi/backends/gpu/cuda/cuda_helper.h b/paddle/phi/backends/gpu/cuda/cuda_helper.h index 6d33d802b1880..7463edc5d9ff6 100644 --- a/paddle/phi/backends/gpu/cuda/cuda_helper.h +++ b/paddle/phi/backends/gpu/cuda/cuda_helper.h @@ -62,11 +62,12 @@ 
namespace gpu { * */ -#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \ - int64_t __index__ = \ - static_cast(blockIdx.x) * blockDim.x + threadIdx.x; \ - for (index_type i = __index__; __index__ < (num); \ - __index__ += blockDim.x * gridDim.x, i = __index__) +#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \ + int64_t __index__ = \ + static_cast(blockIdx.x) * blockDim.x + threadIdx.x; \ + int64_t __stride__ = static_cast(blockDim.x) * gridDim.x; \ + for (index_type i = __index__; __index__ < (num); \ + __index__ += __stride__, i = __index__) } // namespace gpu } // namespace backends diff --git a/paddle/phi/backends/gpu/rocm/rocm_helper.h b/paddle/phi/backends/gpu/rocm/rocm_helper.h index e25dea28e36c1..07fdde5a2f417 100644 --- a/paddle/phi/backends/gpu/rocm/rocm_helper.h +++ b/paddle/phi/backends/gpu/rocm/rocm_helper.h @@ -65,8 +65,9 @@ namespace gpu { #define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \ int64_t __index__ = \ static_cast(hipBlockIdx_x) * hipBlockDim_x + hipThreadIdx_x; \ + int64_t __stride__ = static_cast(hipBlockDim_x) * hipGridDim_x; \ for (index_type i = __index__; __index__ < (num); \ - __index__ += hipBlockDim_x * hipGridDim_x, i = __index__) + __index__ += __stride__, i = __index__) } // namespace gpu } // namespace backends diff --git a/paddle/phi/kernels/funcs/broadcast_function.h b/paddle/phi/kernels/funcs/broadcast_function.h index 40e2e39f024ee..aa2304e38463a 100644 --- a/paddle/phi/kernels/funcs/broadcast_function.h +++ b/paddle/phi/kernels/funcs/broadcast_function.h @@ -224,6 +224,7 @@ struct DimensionsTransform { }; template + int GetVecsize(const std::vector &ins, std::vector *outs) { int in_vec_size = 4; @@ -468,6 +469,233 @@ void LaunchBroadcastKernel( func); } +#ifndef PADDLE_WITH_XPU_KP +HOSTDEVICE static int64_t ConvertSrcIdxToDstIdx( + int64_t src_idx, + const phi::Array &src_strides, + const phi::Array &dst_strides, + int rank) { + int64_t dst_idx = 0; + int64_t old_src_idx = src_idx; + for (int k = 0; k < rank; ++k) { + auto local_idx = src_idx / src_strides[k + 1]; + src_idx -= local_idx * src_strides[k + 1]; + + if (dst_strides[k] != dst_strides[k + 1]) { + dst_idx += local_idx * dst_strides[k + 1]; + } + } + return dst_idx; +} + +template +HOSTDEVICE static void ReadVecDataWithInt64Index( + const T *in, + int64_t idx, + const phi::Array &src_strides, + const phi::Array &dst_strides, + int rank, + phi::AlignedVector *out) { + if (src_strides[0] == dst_strides[0]) { + phi::Load(in + idx, out); + } else { +#pragma unroll + for (int i = 0; i < VecSize; ++i) { + (*out)[i] = + in[ConvertSrcIdxToDstIdx(idx + i, src_strides, dst_strides, rank)]; + } + } +} + +template +__global__ void BinaryBroadcastKernelWithInt64Index( + const InT *x, + const InT *y, + OutT *z, + phi::Array x_strides, + phi::Array y_strides, + phi::Array z_strides, + int rank, + Functor functor) { + int64_t numel = z_strides[0]; + int64_t idx = + (static_cast(blockIdx.x) * blockDim.x + threadIdx.x) * VecSize; + int64_t stride = static_cast(blockDim.x) * gridDim.x * VecSize; + int64_t limit = numel - VecSize; + + for (; idx <= limit; idx += stride) { + phi::AlignedVector x_vec, y_vec; + phi::AlignedVector z_vec; + ReadVecDataWithInt64Index( + x, idx, z_strides, x_strides, rank, &x_vec); + ReadVecDataWithInt64Index( + y, idx, z_strides, y_strides, rank, &y_vec); +#pragma unroll + for (int i = 0; i < VecSize; ++i) { + z_vec[i] = functor(x_vec[i], y_vec[i]); + } + phi::Store(z_vec, z + idx); + } + + for (; idx < numel; ++idx) { + z[idx] = functor(x[idx], y[idx]); + } +} + +template 
+struct LaunchBroadcastKernelWithInt64IndexHelper { + static void Run(const KPDevice &ctx, + const std::vector &ins, + std::vector *outs, + int axis, + Functor func) { + PADDLE_THROW(phi::errors::PermissionDenied( + "Unreachable code branch. This may be a bug.")); + } +}; + +template +struct LaunchBroadcastKernelWithInt64IndexHelper { + static void Run(const KPDevice &ctx, + const std::vector &ins, + std::vector *outs, + int axis, + Functor func) { + const auto *x = ins[0], *y = ins[1]; + auto *z = (*outs)[0]; + const auto *x_data = x->data(); + const auto *y_data = y->data(); + auto *z_data = ctx.template Alloc(z); + + phi::Array x_out_dims, y_out_dims, + broadcast_out_dims; + int rank; + CalculateBroadcastDims(x->dims(), + y->dims(), + axis, + &x_out_dims, + &y_out_dims, + &broadcast_out_dims, + &rank); + + auto x_strides = ShapeToStride(x_out_dims, rank); + auto y_strides = ShapeToStride(y_out_dims, rank); + auto z_strides = ShapeToStride(broadcast_out_dims, rank); + int64_t numel = z_strides[0]; + auto gpu_config = + phi::backends::gpu::GetGpuLaunchConfig1D(ctx, numel, VecSize); + + BinaryBroadcastKernelWithInt64Index + <<>>(x_data, + y_data, + z_data, + x_strides, + y_strides, + z_strides, + rank, + func); + } + + private: + static void CalculateBroadcastDims( + const phi::DDim &x_dims, + const phi::DDim &y_dims, + int axis, + phi::Array *x_out_dims, + phi::Array *y_out_dims, + phi::Array *broadcast_out_dims, + int *length) { + int nx = x_dims.size(), ny = y_dims.size(); + PADDLE_ENFORCE_GE( + axis, 0, phi::errors::InvalidArgument("Invalid axis value: %d", axis)); + if (nx == ny) { + *length = nx; + for (int i = 0; i < nx; ++i) { + if (x_dims[i] != y_dims[i]) { + PADDLE_ENFORCE_EQ( + x_dims[i] == 1 || y_dims[i] == 1, + true, + phi::errors::InvalidArgument("Cannot broadcast input shape where " + "x_dims[%d] = %d, y_dims[%d] = %d.", + i, + x_dims[i], + i, + y_dims[i])); + } + (*broadcast_out_dims)[i] = std::max(x_dims[i], y_dims[i]); + (*x_out_dims)[i] = x_dims[i]; + (*y_out_dims)[i] = y_dims[i]; + } + } else if (nx > ny) { + *length = nx; + for (int i = nx - axis; i < ny; ++i) { + PADDLE_ENFORCE_EQ( + y_dims[i], + 1, + phi::errors::InvalidArgument( + "The trailing Y.shape[%d] should be 1 but got %d.", + i, + y_dims[i])); + } + + for (int i = 0; i < nx; ++i) { + if (i >= axis && i - axis < ny) { + if (x_dims[i] != y_dims[i - axis]) { + PADDLE_ENFORCE_EQ(x_dims[i] == 1 || y_dims[i - axis] == 1, + true, + phi::errors::InvalidArgument( + "Cannot broadcast input shape where " + "x_dims[%d] = %d, y_dims[%d] = %d.", + i, + x_dims[i], + i - axis, + y_dims[i - axis])); + } + (*broadcast_out_dims)[i] = std::max(x_dims[i], y_dims[i - axis]); + (*x_out_dims)[i] = x_dims[i]; + (*y_out_dims)[i] = y_dims[i - axis]; + } else { + (*broadcast_out_dims)[i] = x_dims[i]; + (*x_out_dims)[i] = x_dims[i]; + (*y_out_dims)[i] = 1; + } + } + } else { + CalculateBroadcastDims(y_dims, + x_dims, + axis, + y_out_dims, + x_out_dims, + broadcast_out_dims, + length); + } + } + + static phi::Array ShapeToStride( + const phi::Array &arr, int rank) { + phi::Array strides; + strides[rank] = 1; + for (int i = rank - 1; i >= 0; --i) { + strides[i] = strides[i + 1] * arr[i]; + } + return strides; + } +}; +#endif + template size(), NumOuts)); +#ifndef PADDLE_WITH_XPU_KP + constexpr bool kEnabledInt64IndexKernel = (NumOuts == 1 && kArity == 2); + bool use_int64_index_kernel = + kEnabledInt64IndexKernel && + (*outs)[0]->numel() >= std::numeric_limits::max(); + use_int64_index_kernel = kEnabledInt64IndexKernel; + if 
(use_int64_index_kernel) { + int vec_size = GetVecsize(ins, outs); + switch (vec_size) { + case VecSizeL: { + LaunchBroadcastKernelWithInt64IndexHelper::Run(ctx, + ins, + outs, + axis, + func); + break; + } + case VecSizeM: { + LaunchBroadcastKernelWithInt64IndexHelper::Run(ctx, + ins, + outs, + axis, + func); + break; + } + case VecSizeS: { + LaunchBroadcastKernelWithInt64IndexHelper::Run(ctx, + ins, + outs, + axis, + func); + break; + } + default: { + PADDLE_THROW(phi::errors::Unimplemented( + "Unsupported vectorized size: %d!", vec_size)); + break; + } + } + return; + } +#endif + // mergedim and get vec_size const auto merge_dims = DimensionsTransform(ins, (*outs)[0]->dims(), axis); phi::Array configs; From c86679abba3bbceb93df65e25efb26553aea7fe7 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Wed, 7 Sep 2022 05:01:33 +0000 Subject: [PATCH 03/12] fix bug --- paddle/phi/kernels/funcs/broadcast_function.h | 22 ++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/paddle/phi/kernels/funcs/broadcast_function.h b/paddle/phi/kernels/funcs/broadcast_function.h index aa2304e38463a..40c10100615c7 100644 --- a/paddle/phi/kernels/funcs/broadcast_function.h +++ b/paddle/phi/kernels/funcs/broadcast_function.h @@ -492,11 +492,12 @@ template HOSTDEVICE static void ReadVecDataWithInt64Index( const T *in, int64_t idx, + bool no_broadcast, const phi::Array &src_strides, const phi::Array &dst_strides, int rank, phi::AlignedVector *out) { - if (src_strides[0] == dst_strides[0]) { + if (no_broadcast) { phi::Load(in + idx, out); } else { #pragma unroll @@ -516,6 +517,8 @@ __global__ void BinaryBroadcastKernelWithInt64Index( phi::Array y_strides, phi::Array z_strides, int rank, + bool x_no_broadcast, + bool y_no_broadcast, Functor functor) { int64_t numel = z_strides[0]; int64_t idx = @@ -527,9 +530,9 @@ __global__ void BinaryBroadcastKernelWithInt64Index( phi::AlignedVector x_vec, y_vec; phi::AlignedVector z_vec; ReadVecDataWithInt64Index( - x, idx, z_strides, x_strides, rank, &x_vec); + x, idx, x_no_broadcast, z_strides, x_strides, rank, &x_vec); ReadVecDataWithInt64Index( - y, idx, z_strides, y_strides, rank, &y_vec); + y, idx, y_no_broadcast, z_strides, y_strides, rank, &y_vec); #pragma unroll for (int i = 0; i < VecSize; ++i) { z_vec[i] = functor(x_vec[i], y_vec[i]); @@ -587,6 +590,8 @@ struct LaunchBroadcastKernelWithInt64IndexHelper &x, + const phi::Array &y, + int rank) { + for (int i = 0; i < rank; ++i) { + if (x[i] != y[i]) return false; + } + return true; + } + static phi::Array ShapeToStride( const phi::Array &arr, int rank) { phi::Array strides; From 727229d8f3208c9380bf2922fff06f934bd14fa7 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Thu, 8 Sep 2022 05:55:06 +0000 Subject: [PATCH 04/12] support more Arity --- paddle/phi/kernels/funcs/broadcast_function.h | 316 +++++++++++++----- 1 file changed, 228 insertions(+), 88 deletions(-) diff --git a/paddle/phi/kernels/funcs/broadcast_function.h b/paddle/phi/kernels/funcs/broadcast_function.h index 8245d6596686c..f575915eba929 100644 --- a/paddle/phi/kernels/funcs/broadcast_function.h +++ b/paddle/phi/kernels/funcs/broadcast_function.h @@ -487,62 +487,161 @@ HOSTDEVICE static int64_t ConvertSrcIdxToDstIdx( return dst_idx; } -template +template HOSTDEVICE static void ReadVecDataWithInt64Index( const T *in, int64_t idx, - bool no_broadcast, + bool need_broadcast, const phi::Array &src_strides, const phi::Array &dst_strides, int rank, + int n, phi::AlignedVector *out) { - if (no_broadcast) { - phi::Load(in + idx, out); - } else { 
-#pragma unroll - for (int i = 0; i < VecSize; ++i) { + if (IsBoundary) { + for (int i = 0; i < n; ++i) { (*out)[i] = in[ConvertSrcIdxToDstIdx(idx + i, src_strides, dst_strides, rank)]; } + } else { + if (!need_broadcast) { + phi::Load(in + idx, out); + } else { +#pragma unroll + for (int i = 0; i < VecSize; ++i) { + (*out)[i] = + in[ConvertSrcIdxToDstIdx(idx + i, src_strides, dst_strides, rank)]; + } + } } } +template +struct ApplyFunctorWithInt64IndexHelper { + HOSTDEVICE static OutT Run(const phi::AlignedVector *ins_vec, + Functor functor, + int i); +}; + template -__global__ void BinaryBroadcastKernelWithInt64Index( - const InT *x, - const InT *y, - OutT *z, - phi::Array x_strides, - phi::Array y_strides, - phi::Array z_strides, +struct ApplyFunctorWithInt64IndexHelper { + HOSTDEVICE static OutT Run(const phi::AlignedVector *ins_vec, + Functor functor, + int i) { + return static_cast(functor()); + } +}; + +template +struct ApplyFunctorWithInt64IndexHelper { + HOSTDEVICE static OutT Run(const phi::AlignedVector *ins_vec, + Functor functor, + int i) { + return static_cast(functor(ins_vec[0][i])); + } +}; + +template +struct ApplyFunctorWithInt64IndexHelper { + HOSTDEVICE static OutT Run(const phi::AlignedVector *ins_vec, + Functor functor, + int i) { + return static_cast(functor(ins_vec[0][i], ins_vec[1][i])); + } +}; + +template +struct ApplyFunctorWithInt64IndexHelper { + HOSTDEVICE static OutT Run(const phi::AlignedVector *ins_vec, + Functor functor, + int i) { + return static_cast( + functor(ins_vec[0][i], ins_vec[1][i], ins_vec[2][i])); + } +}; + +template +struct MaxWithOne { + static constexpr auto kValue = (N >= 1 ? N : 1); +}; + +template +__global__ void BroadcastKernelWithInt64Index( + phi::Array::kValue> ins, + OutT *out, + phi::Array, + MaxWithOne::kValue> ins_strides, + phi::Array out_strides, + phi::Array::kValue> need_broadcasts, int rank, - bool x_no_broadcast, - bool y_no_broadcast, Functor functor) { - int64_t numel = z_strides[0]; + int64_t numel = out_strides[0]; int64_t idx = (static_cast(blockIdx.x) * blockDim.x + threadIdx.x) * VecSize; int64_t stride = static_cast(blockDim.x) * gridDim.x * VecSize; int64_t limit = numel - VecSize; + phi::Array, MaxWithOne::kValue> + ins_vec; + phi::AlignedVector out_vec; for (; idx <= limit; idx += stride) { - phi::AlignedVector x_vec, y_vec; - phi::AlignedVector z_vec; - ReadVecDataWithInt64Index( - x, idx, x_no_broadcast, z_strides, x_strides, rank, &x_vec); - ReadVecDataWithInt64Index( - y, idx, y_no_broadcast, z_strides, y_strides, rank, &y_vec); +#pragma unroll + for (int i = 0; i < NumIns; ++i) { + ReadVecDataWithInt64Index(ins[i], + idx, + need_broadcasts[i], + out_strides, + ins_strides[i], + rank, + VecSize, + &ins_vec[i]); + } + #pragma unroll for (int i = 0; i < VecSize; ++i) { - z_vec[i] = functor(x_vec[i], y_vec[i]); + out_vec[i] = ApplyFunctorWithInt64IndexHelper::Run(ins_vec.Get(), + functor, + i); } - phi::Store(z_vec, z + idx); + + phi::Store(out_vec, out + idx); } - for (; idx < numel; ++idx) { - int64_t x_idx = ConvertSrcIdxToDstIdx(idx, z_strides, x_strides, rank); - int64_t y_idx = ConvertSrcIdxToDstIdx(idx, z_strides, y_strides, rank); - z[idx] = functor(x[x_idx], y[y_idx]); + if (idx < numel) { + int remain = numel - idx; // remain is always less than VecSize, therefore + // `int` is enough here +#pragma unroll + for (int i = 0; i < NumIns; ++i) { + ReadVecDataWithInt64Index(ins[i], + idx, + need_broadcasts[i], + out_strides, + ins_strides[i], + rank, + remain, + &ins_vec[i]); + } + + for (int i = 0; i < 
remain; ++i) { + out[idx] = ApplyFunctorWithInt64IndexHelper::Run(ins_vec.Get(), + functor, + i); + } } } @@ -557,76 +656,117 @@ struct LaunchBroadcastKernelWithInt64IndexHelper { const std::vector &ins, std::vector *outs, int axis, - Functor func) { + Functor functor) { PADDLE_THROW(phi::errors::PermissionDenied( "Unreachable code branch. This may be a bug.")); } }; -template +template struct LaunchBroadcastKernelWithInt64IndexHelper { static void Run(const KPDevice &ctx, const std::vector &ins, std::vector *outs, int axis, - Functor func) { - const auto *x = ins[0], *y = ins[1]; - auto *z = (*outs)[0]; - const auto *x_data = x->data(); - const auto *y_data = y->data(); - auto *z_data = ctx.template Alloc(z); - - phi::Array x_out_dims, y_out_dims, - broadcast_out_dims; + Functor functor) { + phi::Array::kValue> ins_ptrs; + for (int i = 0; i < Arity; ++i) { + ins_ptrs[i] = ins[i]->data(); + } + auto *out_tensor = (*outs)[0]; + auto *out_ptr = ctx.Alloc(out_tensor); + + phi::Array, phi::DDim::kMaxRank> + ins_expand_dims; + phi::Array broadcast_out_dims; int rank; - CalculateBroadcastDims(x->dims(), - y->dims(), - axis, - &x_out_dims, - &y_out_dims, - &broadcast_out_dims, - &rank); - bool x_no_broadcast = IsSame(x_out_dims, broadcast_out_dims, rank); - bool y_no_broadcast = IsSame(y_out_dims, broadcast_out_dims, rank); - - auto x_strides = ShapeToStride(x_out_dims, rank); - auto y_strides = ShapeToStride(y_out_dims, rank); - auto z_strides = ShapeToStride(broadcast_out_dims, rank); - int64_t numel = z_strides[0]; + if (Arity == 1) { + rank = ins[0]->dims().size(); + for (int i = 0; i < rank; ++i) { + broadcast_out_dims[i] = ins[0]->dims()[i]; + } + ins_expand_dims[0] = broadcast_out_dims; + } else if (Arity >= 2) { + phi::Array + ins_expand_dims[MaxWithOne::kValue]; + CalculateBroadcastDims(ins[0]->dims().Get(), + ins[1]->dims().Get(), + ins[0]->dims().size(), + ins[1]->dims().size(), + axis, + ins_expand_dims[0].GetMutable(), + ins_expand_dims[1].GetMutable(), + broadcast_out_dims.GetMutable(), + &rank); + for (int i = 2; i < Arity; ++i) { + auto tmp_dims = broadcast_out_dims; + phi::Array tmp_expand_dims; + int tmp_rank; + PADDLE_ENFORCE_GE(rank, + ins[i]->dims().size(), + phi::errors::InvalidArgument( + "Unsupported reverse broadcast when the input " + "tensor number is larger than 2.")); + CalculateBroadcastDims(tmp_dims.Get(), + ins[i]->dims().Get(), + rank, + ins[i]->dims().size(), + axis, + tmp_expand_dims.GetMutable(), + ins_expand_dims[i].GetMutable(), + broadcast_out_dims.GetMutable(), + &tmp_rank); + PADDLE_ENFORCE_EQ(rank, + tmp_rank, + phi::errors::InvalidArgument( + "Wrong broadcast algorithm. 
This may be a bug.")); + } + } + + phi::Array, + MaxWithOne::kValue> + ins_strides; + phi::Array::kValue> need_broadcasts; + + auto out_strides = ShapeToStride(broadcast_out_dims.Get(), rank); + for (int i = 0; i < Arity; ++i) { + ins_strides[i] = ShapeToStride(ins_expand_dims[i].Get(), rank); + need_broadcasts[i] = + !IsSameShape(out_strides.Get(), ins_strides[i].Get(), rank + 1); + } + + int64_t numel = out_strides[0]; auto gpu_config = phi::backends::gpu::GetGpuLaunchConfig1D(ctx, numel, VecSize); - BinaryBroadcastKernelWithInt64Index + BroadcastKernelWithInt64Index <<>>(x_data, - y_data, - z_data, - x_strides, - y_strides, - z_strides, + ctx.stream()>>>(ins_ptrs, + out_ptr, + ins_strides, + out_strides, + need_broadcasts, rank, - x_no_broadcast, - y_no_broadcast, - func); + functor); } private: - static void CalculateBroadcastDims( - const phi::DDim &x_dims, - const phi::DDim &y_dims, - int axis, - phi::Array *x_out_dims, - phi::Array *y_out_dims, - phi::Array *broadcast_out_dims, - int *length) { - int nx = x_dims.size(), ny = y_dims.size(); + static void CalculateBroadcastDims(const int64_t *x_dims, + const int64_t *y_dims, + int nx, + int ny, + int axis, + int64_t *x_out_dims, + int64_t *y_out_dims, + int64_t *broadcast_out_dims, + int *length) { PADDLE_ENFORCE_GE( axis, 0, phi::errors::InvalidArgument("Invalid axis value: %d", axis)); if (nx == ny) { @@ -643,9 +783,9 @@ struct LaunchBroadcastKernelWithInt64IndexHelper ny) { *length = nx; @@ -672,18 +812,20 @@ struct LaunchBroadcastKernelWithInt64IndexHelper &x, - const phi::Array &y, - int rank) { + static bool IsSameShape(const int64_t *x, const int64_t *y, int rank) { for (int i = 0; i < rank; ++i) { if (x[i] != y[i]) return false; } @@ -702,7 +842,7 @@ struct LaunchBroadcastKernelWithInt64IndexHelper ShapeToStride( - const phi::Array &arr, int rank) { + const int64_t *arr, int rank) { phi::Array strides; strides[rank] = 1; for (int i = rank - 1; i >= 0; --i) { @@ -755,7 +895,7 @@ void BroadcastKernelForDifferentVecSize( NumOuts)); #ifndef PADDLE_WITH_XPU_KP - constexpr bool kEnabledInt64IndexKernel = (NumOuts == 1 && kArity == 2); + constexpr bool kEnabledInt64IndexKernel = (NumOuts == 1 && kArity <= 3); bool use_int64_index_kernel = kEnabledInt64IndexKernel && (*outs)[0]->numel() >= std::numeric_limits::max(); From c081861917045b0ddea880aa04fe0796b9144aae Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Thu, 8 Sep 2022 06:13:36 +0000 Subject: [PATCH 05/12] remove some codes --- paddle/phi/kernels/funcs/broadcast_function.h | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/paddle/phi/kernels/funcs/broadcast_function.h b/paddle/phi/kernels/funcs/broadcast_function.h index f575915eba929..683806c1e5f47 100644 --- a/paddle/phi/kernels/funcs/broadcast_function.h +++ b/paddle/phi/kernels/funcs/broadcast_function.h @@ -1023,24 +1023,10 @@ void BroadcastKernel(const KPDevice &ctx, Functor func) { std::vector dims_size; dims_size.reserve(ins.size()); - - bool no_broadcast_flag = true; for (auto *in : ins) { - no_broadcast_flag &= (in->dims() == ins[0]->dims()); dims_size.emplace_back(in->dims().size()); } - if (!ins.empty()) { - for (auto *out : *outs) { - no_broadcast_flag &= (out->dims() == ins[0]->dims()); - } - } - - if (no_broadcast_flag) { - phi::funcs::ElementwiseKernel(ctx, ins, outs, func); - return; - } - axis = axis == -1 ? 
*std::max_element(dims_size.begin(), dims_size.end()) - *std::min_element(dims_size.begin(), dims_size.end()) : axis; From 4535d32422186f6d289a78e3eae5882ca499657f Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Thu, 8 Sep 2022 07:09:37 +0000 Subject: [PATCH 06/12] upgrade patchelf to v0.15.0 to pass CI build --- tools/dockerfile/Dockerfile.release16 | 13 ++++---- tools/dockerfile/Dockerfile.release18 | 12 +++----- tools/dockerfile/Dockerfile.ubuntu | 14 ++++----- tools/dockerfile/Dockerfile.ubuntu18 | 12 +++----- tools/dockerfile/build_scripts/build.sh | 2 +- .../build_scripts/install_patchelf.sh | 30 +++++++++++++++++++ 6 files changed, 51 insertions(+), 32 deletions(-) create mode 100644 tools/dockerfile/build_scripts/install_patchelf.sh diff --git a/tools/dockerfile/Dockerfile.release16 b/tools/dockerfile/Dockerfile.release16 index 66974f46d91e4..49637a3fee9a4 100644 --- a/tools/dockerfile/Dockerfile.release16 +++ b/tools/dockerfile/Dockerfile.release16 @@ -101,6 +101,12 @@ RUN curl -s -q https://glide.sh/get | sh # Downgrade TensorRT COPY tools/dockerfile/build_scripts /build_scripts RUN bash /build_scripts/install_nccl2.sh + +# Older versions of patchelf limited the size of the files being processed and were fixed in this pr. +# # https://github.com/NixOS/patchelf/commit/ba2695a8110abbc8cc6baf0eea819922ee5007fa +# # So install a newer version here. +RUN bash /build_scripts/install_patchelf.sh + RUN rm -rf /build_scripts # git credential to skip password typing @@ -143,13 +149,6 @@ RUN wget -q https://launchpad.net/ubuntu/+archive/primary/+sourcefiles/binutils/ RUN apt-get install libprotobuf-dev -y -# Older versions of patchelf limited the size of the files being processed and were fixed in this pr. -# https://github.com/NixOS/patchelf/commit/ba2695a8110abbc8cc6baf0eea819922ee5007fa -# So install a newer version here. -RUN wget -q https://paddle-ci.cdn.bcebos.com/patchelf_0.10-2_amd64.deb && \ - dpkg -i patchelf_0.10-2_amd64.deb && \ - rm -rf patchelf_0.10-2_amd64.deb - # Configure OpenSSH server. c.f. https://docs.docker.com/engine/examples/running_ssh_service RUN mkdir /var/run/sshd && echo 'root:root' | chpasswd && sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config && sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config CMD source ~/.bashrc diff --git a/tools/dockerfile/Dockerfile.release18 b/tools/dockerfile/Dockerfile.release18 index d646f41b00d0b..cf343873d943a 100644 --- a/tools/dockerfile/Dockerfile.release18 +++ b/tools/dockerfile/Dockerfile.release18 @@ -28,6 +28,10 @@ RUN apt-get update && \ # Downgrade gcc&&g++ WORKDIR /usr/bin COPY tools/dockerfile/build_scripts /build_scripts +# Older versions of patchelf limited the size of the files being processed and were fixed in this pr. +# # https://github.com/NixOS/patchelf/commit/ba2695a8110abbc8cc6baf0eea819922ee5007fa +# # So install a newer version here. +RUN bash /build_scripts/install_patchelf.sh RUN bash /build_scripts/install_gcc.sh gcc82 && rm -rf /build_scripts RUN cp gcc gcc.bak && cp g++ g++.bak && rm gcc && rm g++ RUN ln -s /usr/local/gcc-8.2/bin/gcc /usr/local/bin/gcc @@ -99,14 +103,6 @@ RUN pip3.7 --no-cache-dir install pylint pytest astroid isort COPY ./python/requirements.txt /root/ RUN pip3.7 --no-cache-dir install -r /root/requirements.txt - -# Older versions of patchelf limited the size of the files being processed and were fixed in this pr. -# https://github.com/NixOS/patchelf/commit/ba2695a8110abbc8cc6baf0eea819922ee5007fa -# So install a newer version here. 
-RUN wget -q https://paddle-ci.cdn.bcebos.com/patchelf_0.10-2_amd64.deb && \ - dpkg -i patchelf_0.10-2_amd64.deb && \ - rm -rf patchelf_0.10-2_amd64.deb - # Configure OpenSSH server. c.f. https://docs.docker.com/engine/examples/running_ssh_service #RUN mkdir /var/run/sshd && echo 'root:root' | chpasswd && sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config && sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config #CMD source ~/.bashrc diff --git a/tools/dockerfile/Dockerfile.ubuntu b/tools/dockerfile/Dockerfile.ubuntu index 7e0c3a62b1d50..7c9636d57ddf5 100644 --- a/tools/dockerfile/Dockerfile.ubuntu +++ b/tools/dockerfile/Dockerfile.ubuntu @@ -143,9 +143,14 @@ RUN curl -s -q https://glide.sh/get | sh # See https://github.com/PaddlePaddle/Paddle/issues/10129 for details. # Downgrade TensorRT + +# Older versions of patchelf limited the size of the files being processed and were fixed in this pr. +# # https://github.com/NixOS/patchelf/commit/ba2695a8110abbc8cc6baf0eea819922ee5007fa +# # So install a newer version here. COPY tools/dockerfile/build_scripts /build_scripts RUN bash /build_scripts/install_trt.sh && \ - bash /build_scripts/install_nccl2.sh + bash /build_scripts/install_nccl2.sh && \ + bash /build_scripts/install_patchelf.sh RUN rm -rf /build_scripts # git credential to skip password typing @@ -236,13 +241,6 @@ RUN wget -q https://launchpad.net/ubuntu/+archive/primary/+sourcefiles/binutils/ RUN apt-get install libprotobuf-dev -y -# Older versions of patchelf limited the size of the files being processed and were fixed in this pr. -# https://github.com/NixOS/patchelf/commit/ba2695a8110abbc8cc6baf0eea819922ee5007fa -# So install a newer version here. -RUN wget -q https://paddle-ci.cdn.bcebos.com/patchelf_0.10-2_amd64.deb && \ - dpkg -i patchelf_0.10-2_amd64.deb && \ - rm -rf patchelf_0.10-2_amd64.deb - # Configure OpenSSH server. c.f. https://docs.docker.com/engine/examples/running_ssh_service RUN mkdir /var/run/sshd && echo 'root:root' | chpasswd && sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config && sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config CMD source ~/.bashrc diff --git a/tools/dockerfile/Dockerfile.ubuntu18 b/tools/dockerfile/Dockerfile.ubuntu18 index a5dba053b98b2..8ebfd9b8371c2 100644 --- a/tools/dockerfile/Dockerfile.ubuntu18 +++ b/tools/dockerfile/Dockerfile.ubuntu18 @@ -35,6 +35,10 @@ RUN apt-get update --allow-unauthenticated && \ WORKDIR /usr/bin COPY tools/dockerfile/build_scripts /build_scripts RUN bash /build_scripts/install_trt.sh +# Older versions of patchelf limited the size of the files being processed and were fixed in this pr. +# # https://github.com/NixOS/patchelf/commit/ba2695a8110abbc8cc6baf0eea819922ee5007fa +# # So install a newer version here. +RUN bash /build_scripts/install_patchelf.sh RUN bash /build_scripts/install_gcc.sh gcc82 && rm -rf /build_scripts RUN cp gcc gcc.bak && cp g++ g++.bak && rm gcc && rm g++ RUN ln -s /usr/local/gcc-8.2/bin/gcc /usr/local/bin/gcc @@ -151,14 +155,6 @@ RUN pip3.6 --no-cache-dir install -r /root/requirements.txt && \ pip3.8 --no-cache-dir install -r /root/requirements.txt && \ pip3.9 --no-cache-dir install -r /root/requirements.txt - -# Older versions of patchelf limited the size of the files being processed and were fixed in this pr. -# https://github.com/NixOS/patchelf/commit/ba2695a8110abbc8cc6baf0eea819922ee5007fa -# So install a newer version here. 
-RUN wget -q https://paddle-ci.cdn.bcebos.com/patchelf_0.10-2_amd64.deb && \ - dpkg -i patchelf_0.10-2_amd64.deb && \ - rm -rf patchelf_0.10-2_amd64.deb - # Configure OpenSSH server. c.f. https://docs.docker.com/engine/examples/running_ssh_service #RUN mkdir /var/run/sshd && echo 'root:root' | chpasswd && sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config && sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config #CMD source ~/.bashrc diff --git a/tools/dockerfile/build_scripts/build.sh b/tools/dockerfile/build_scripts/build.sh index 92d1c12d2bc41..61bcc1f103563 100644 --- a/tools/dockerfile/build_scripts/build.sh +++ b/tools/dockerfile/build_scripts/build.sh @@ -106,7 +106,7 @@ export SSL_CERT_FILE=/opt/_internal/certs.pem # tar -xzf patchelf-0.9njs2.tar.gz # (cd patchelf-0.9njs2 && ./configure && make && make install) # rm -rf patchelf-0.9njs2.tar.gz patchelf-0.9njs2 -yum install -y patchelf +sh "$MY_DIR/install_patchelf.sh" # Install latest pypi release of auditwheel #LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname ${PY35_BIN})/lib" $PY35_BIN/pip install auditwheel diff --git a/tools/dockerfile/build_scripts/install_patchelf.sh b/tools/dockerfile/build_scripts/install_patchelf.sh new file mode 100644 index 0000000000000..4de9c6352f2ea --- /dev/null +++ b/tools/dockerfile/build_scripts/install_patchelf.sh @@ -0,0 +1,30 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -e + +TMP_DIR=patchelf_tmp + +rm -rf "$TMP_DIR" +git clone -b 0.15.0 https://github.com/NixOS/patchelf "$TMP_DIR" + +cd "$TMP_DIR" +./bootstrap.sh +./configure +make +make check +make install + +cd .. 
+rm -rf "$TMP_DIR" From d0d3981b711ee842f402f6b438f54f797d38f11e Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Thu, 8 Sep 2022 08:09:57 +0000 Subject: [PATCH 07/12] fix bug --- paddle/phi/kernels/funcs/broadcast_function.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/paddle/phi/kernels/funcs/broadcast_function.h b/paddle/phi/kernels/funcs/broadcast_function.h index 683806c1e5f47..624be323cd57c 100644 --- a/paddle/phi/kernels/funcs/broadcast_function.h +++ b/paddle/phi/kernels/funcs/broadcast_function.h @@ -681,7 +681,8 @@ struct LaunchBroadcastKernelWithInt64IndexHelper(out_tensor); - phi::Array, phi::DDim::kMaxRank> + phi::Array, + MaxWithOne::kValue> ins_expand_dims; phi::Array broadcast_out_dims; int rank; @@ -692,8 +693,6 @@ struct LaunchBroadcastKernelWithInt64IndexHelper= 2) { - phi::Array - ins_expand_dims[MaxWithOne::kValue]; CalculateBroadcastDims(ins[0]->dims().Get(), ins[1]->dims().Get(), ins[0]->dims().size(), From 914642d67d28d4afb5a9fe53fe4dc5f0887abd79 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Fri, 9 Sep 2022 01:47:19 +0000 Subject: [PATCH 08/12] fix patchelf installation --- tools/dockerfile/build_scripts/install_patchelf.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/dockerfile/build_scripts/install_patchelf.sh b/tools/dockerfile/build_scripts/install_patchelf.sh index 4de9c6352f2ea..9fda46e5b6f86 100644 --- a/tools/dockerfile/build_scripts/install_patchelf.sh +++ b/tools/dockerfile/build_scripts/install_patchelf.sh @@ -23,7 +23,6 @@ cd "$TMP_DIR" ./bootstrap.sh ./configure make -make check make install cd .. From 1f018cbc595de6578b7763bd9db6c79a1029c9a6 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Tue, 13 Sep 2022 07:54:09 +0000 Subject: [PATCH 09/12] add debug flags --- paddle/fluid/platform/flags.cc | 2 + paddle/phi/kernels/funcs/broadcast_function.h | 71 ++++++++++++++++--- 2 files changed, 64 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc index 28dddc1fbebdd..2f2eae6add537 100644 --- a/paddle/fluid/platform/flags.cc +++ b/paddle/fluid/platform/flags.cc @@ -1023,3 +1023,5 @@ PADDLE_DEFINE_EXPORTED_bool( PADDLE_DEFINE_EXPORTED_string(jit_engine_type, "PE", "Choose default funciton type in JitLayer."); + +PADDLE_DEFINE_EXPORTED_bool(use_int32_kernel, false, ""); diff --git a/paddle/phi/kernels/funcs/broadcast_function.h b/paddle/phi/kernels/funcs/broadcast_function.h index 624be323cd57c..eb4a332887e9c 100644 --- a/paddle/phi/kernels/funcs/broadcast_function.h +++ b/paddle/phi/kernels/funcs/broadcast_function.h @@ -14,6 +14,7 @@ limitations under the License. 
*/ #pragma once +#include "gflags/gflags.h" #include "paddle/phi/kernels/funcs/elementwise_base.h" #if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) @@ -22,6 +23,8 @@ namespace kps = phi::kps; #endif +DECLARE_bool(use_int32_kernel); + namespace phi { namespace funcs { @@ -469,6 +472,14 @@ void LaunchBroadcastKernel( } #ifndef PADDLE_WITH_XPU_KP +#define CUDA_ASSERT(__cond) \ + do { \ + if (!(__cond)) { \ + printf(#__cond); \ + asm("trap;"); \ + } \ + } while (0) + HOSTDEVICE static int64_t ConvertSrcIdxToDstIdx( int64_t src_idx, const phi::Array &src_strides, @@ -484,6 +495,8 @@ HOSTDEVICE static int64_t ConvertSrcIdxToDstIdx( dst_idx += local_idx * dst_strides[k + 1]; } } + CUDA_ASSERT(src_idx >= 0 && src_idx < src_strides[0]); + CUDA_ASSERT(dst_idx >= 0 && dst_idx < dst_strides[0]); return dst_idx; } @@ -504,6 +517,16 @@ HOSTDEVICE static void ReadVecDataWithInt64Index( } } else { if (!need_broadcast) { +#pragma unroll + for (int i = 0; i < VecSize; ++i) { + CUDA_ASSERT(idx + i >= 0 && idx + i < src_strides[0]); + } + for (int i = 0; i < rank; ++i) { + CUDA_ASSERT(src_strides[i] == dst_strides[i]); + } + CUDA_ASSERT(src_strides[rank] == 1); + CUDA_ASSERT(dst_strides[rank] == 1); + phi::Load(in + idx, out); } else { #pragma unroll @@ -634,13 +657,14 @@ __global__ void BroadcastKernelWithInt64Index( } for (int i = 0; i < remain; ++i) { - out[idx] = ApplyFunctorWithInt64IndexHelper::Run(ins_vec.Get(), - functor, - i); + out[idx + i] = + ApplyFunctorWithInt64IndexHelper::Run(ins_vec.Get(), + functor, + i); } } } @@ -732,7 +756,12 @@ struct LaunchBroadcastKernelWithInt64IndexHelper::kValue> need_broadcasts; - auto out_strides = ShapeToStride(broadcast_out_dims.Get(), rank); + PADDLE_ENFORCE_EQ( + rank, + out_tensor->dims().size(), + phi::errors::InvalidArgument( + "Output tensor's rank does not match. This may be a bug.")); + auto out_strides = ShapeToStride(out_tensor->dims().Get(), rank); for (int i = 0; i < Arity; ++i) { ins_strides[i] = ShapeToStride(ins_expand_dims[i].Get(), rank); need_broadcasts[i] = @@ -743,6 +772,28 @@ struct LaunchBroadcastKernelWithInt64IndexHelper <<numel() >= std::numeric_limits::max(); - use_int64_index_kernel = kEnabledInt64IndexKernel; + if (!FLAGS_use_int32_kernel) { + use_int64_index_kernel = kEnabledInt64IndexKernel; + } if (use_int64_index_kernel) { int vec_size = GetVecsize(ins, outs); switch (vec_size) { From 0358f919ca8eb2145116667e922b8e00923b68a3 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Tue, 13 Sep 2022 07:55:03 +0000 Subject: [PATCH 10/12] remove useless codes --- paddle/fluid/platform/flags.cc | 2 - paddle/phi/kernels/funcs/broadcast_function.h | 48 +------------------ 2 files changed, 1 insertion(+), 49 deletions(-) diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc index 2f2eae6add537..28dddc1fbebdd 100644 --- a/paddle/fluid/platform/flags.cc +++ b/paddle/fluid/platform/flags.cc @@ -1023,5 +1023,3 @@ PADDLE_DEFINE_EXPORTED_bool( PADDLE_DEFINE_EXPORTED_string(jit_engine_type, "PE", "Choose default funciton type in JitLayer."); - -PADDLE_DEFINE_EXPORTED_bool(use_int32_kernel, false, ""); diff --git a/paddle/phi/kernels/funcs/broadcast_function.h b/paddle/phi/kernels/funcs/broadcast_function.h index eb4a332887e9c..46ec31227346b 100644 --- a/paddle/phi/kernels/funcs/broadcast_function.h +++ b/paddle/phi/kernels/funcs/broadcast_function.h @@ -14,7 +14,6 @@ limitations under the License. 
*/ #pragma once -#include "gflags/gflags.h" #include "paddle/phi/kernels/funcs/elementwise_base.h" #if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) @@ -23,8 +22,6 @@ namespace kps = phi::kps; #endif -DECLARE_bool(use_int32_kernel); - namespace phi { namespace funcs { @@ -472,14 +469,6 @@ void LaunchBroadcastKernel( } #ifndef PADDLE_WITH_XPU_KP -#define CUDA_ASSERT(__cond) \ - do { \ - if (!(__cond)) { \ - printf(#__cond); \ - asm("trap;"); \ - } \ - } while (0) - HOSTDEVICE static int64_t ConvertSrcIdxToDstIdx( int64_t src_idx, const phi::Array &src_strides, @@ -495,8 +484,6 @@ HOSTDEVICE static int64_t ConvertSrcIdxToDstIdx( dst_idx += local_idx * dst_strides[k + 1]; } } - CUDA_ASSERT(src_idx >= 0 && src_idx < src_strides[0]); - CUDA_ASSERT(dst_idx >= 0 && dst_idx < dst_strides[0]); return dst_idx; } @@ -518,15 +505,6 @@ HOSTDEVICE static void ReadVecDataWithInt64Index( } else { if (!need_broadcast) { #pragma unroll - for (int i = 0; i < VecSize; ++i) { - CUDA_ASSERT(idx + i >= 0 && idx + i < src_strides[0]); - } - for (int i = 0; i < rank; ++i) { - CUDA_ASSERT(src_strides[i] == dst_strides[i]); - } - CUDA_ASSERT(src_strides[rank] == 1); - CUDA_ASSERT(dst_strides[rank] == 1); - phi::Load(in + idx, out); } else { #pragma unroll @@ -772,28 +750,6 @@ struct LaunchBroadcastKernelWithInt64IndexHelper <<numel() >= std::numeric_limits::max(); - if (!FLAGS_use_int32_kernel) { - use_int64_index_kernel = kEnabledInt64IndexKernel; - } + use_int64_index_kernel = kEnabledInt64IndexKernel; if (use_int64_index_kernel) { int vec_size = GetVecsize(ins, outs); switch (vec_size) { From 86c83b42b501d9e9111e62799340c989524b9be5 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Tue, 13 Sep 2022 14:52:00 +0000 Subject: [PATCH 11/12] fix viterbi_decode and set_value op uts --- paddle/phi/kernels/funcs/broadcast_function.h | 14 +++++++------- paddle/phi/kernels/gpu/viterbi_decode_kernel.cu | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/paddle/phi/kernels/funcs/broadcast_function.h b/paddle/phi/kernels/funcs/broadcast_function.h index 46ec31227346b..f29ed4f02aed1 100644 --- a/paddle/phi/kernels/funcs/broadcast_function.h +++ b/paddle/phi/kernels/funcs/broadcast_function.h @@ -504,7 +504,6 @@ HOSTDEVICE static void ReadVecDataWithInt64Index( } } else { if (!need_broadcast) { -#pragma unroll phi::Load(in + idx, out); } else { #pragma unroll @@ -733,13 +732,14 @@ struct LaunchBroadcastKernelWithInt64IndexHelper::kValue> ins_strides; phi::Array::kValue> need_broadcasts; + phi::Array out_strides; + const auto &out_dims = out_tensor->dims(); + if (rank <= out_dims.size()) { + out_strides = ShapeToStride(out_dims.Get(), rank); + } else { + out_strides = ShapeToStride(broadcast_out_dims.Get(), rank); + } - PADDLE_ENFORCE_EQ( - rank, - out_tensor->dims().size(), - phi::errors::InvalidArgument( - "Output tensor's rank does not match. 
This may be a bug."));
-    auto out_strides = ShapeToStride(out_tensor->dims().Get(), rank);
     for (int i = 0; i < Arity; ++i) {
       ins_strides[i] = ShapeToStride(ins_expand_dims[i].Get(), rank);
       need_broadcasts[i] =
diff --git a/paddle/phi/kernels/gpu/viterbi_decode_kernel.cu b/paddle/phi/kernels/gpu/viterbi_decode_kernel.cu
index 224651326d762..31227e59433ea 100644
--- a/paddle/phi/kernels/gpu/viterbi_decode_kernel.cu
+++ b/paddle/phi/kernels/gpu/viterbi_decode_kernel.cu
@@ -92,7 +92,7 @@ struct BinaryOperation {
     std::vector<DenseTensor *> outs{output};
     paddle::operators::
         LaunchElementwiseCudaKernel(
-            dev_ctx, ins, &outs, -1, BinaryFunctor());
+            dev_ctx, ins, &outs, 0, BinaryFunctor());
   }
 };

From 5cf185430077682fdd66842db7403c181e825a6e Mon Sep 17 00:00:00 2001
From: sneaxiy
Date: Wed, 14 Sep 2022 01:54:35 +0000
Subject: [PATCH 12/12] remove always enable int64

---
 paddle/phi/kernels/funcs/broadcast_function.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/paddle/phi/kernels/funcs/broadcast_function.h b/paddle/phi/kernels/funcs/broadcast_function.h
index f29ed4f02aed1..7d9efa46b7a5d 100644
--- a/paddle/phi/kernels/funcs/broadcast_function.h
+++ b/paddle/phi/kernels/funcs/broadcast_function.h
@@ -905,7 +905,6 @@ void BroadcastKernelForDifferentVecSize(
   bool use_int64_index_kernel =
       kEnabledInt64IndexKernel &&
       (*outs)[0]->numel() >= std::numeric_limits<int32_t>::max();
-  use_int64_index_kernel = kEnabledInt64IndexKernel;
   if (use_int64_index_kernel) {
     int vec_size = GetVecsize<OutT>(ins, outs);
     switch (vec_size) {
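
A note on the CUDA_KERNEL_LOOP_TYPE change in patch 02: blockDim.x and gridDim.x are 32-bit unsigned values, so the old increment `__index__ += blockDim.x * gridDim.x` performed the multiplication in unsigned 32-bit arithmetic; the product (up to 1024 threads times 2^31 - 1 blocks) can exceed 32 bits and is truncated before it is added to the 64-bit index. Hoisting the stride into an explicit int64_t widens the multiplication itself, and the same fix is applied to both the fluid and phi copies of cuda_helper.h and rocm_helper.h. A minimal sketch of the resulting grid-stride pattern (FillOnes and its signature are illustrative only, not Paddle code):

    #include <cstdint>

    // The start index and the stride are both widened to int64_t *before*
    // any multiplication, matching the patched CUDA_KERNEL_LOOP_TYPE
    // expansion.
    __global__ void FillOnes(float *out, int64_t num) {
      int64_t idx = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
      int64_t stride = static_cast<int64_t>(blockDim.x) * gridDim.x;
      for (; idx < num; idx += stride) {
        // With a 32-bit stride this loop could revisit or skip elements
        // once blockDim.x * gridDim.x wraps around.
        out[idx] = 1.0f;
      }
    }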
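
A note on the int64-index broadcast path added in patches 02-04: it is selected in BroadcastKernelForDifferentVecSize once the output numel reaches std::numeric_limits<int32_t>::max(), and its core is ConvertSrcIdxToDstIdx, which walks the output strides to recover per-dimension coordinates and accumulates the input offset, skipping any dimension whose adjacent strides are equal (a size-1, i.e. broadcast, dimension). Below is a host-side sketch of that mapping, with simplified types (std::vector in place of phi::Array) and a made-up [2, 3] versus [1, 3] example; it illustrates the indexing scheme and is not the kernel itself:

    #include <cassert>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // Row-major strides with one extra leading slot, as in the patch's
    // ShapeToStride: strides[0] ends up holding the total element count.
    std::vector<int64_t> ShapeToStride(const std::vector<int64_t> &dims) {
      int rank = static_cast<int>(dims.size());
      std::vector<int64_t> strides(rank + 1);
      strides[rank] = 1;
      for (int i = rank - 1; i >= 0; --i) strides[i] = strides[i + 1] * dims[i];
      return strides;
    }

    // Decompose the output (src) linear index into coordinates and
    // re-compose the input (dst) offset, dropping broadcast dimensions.
    int64_t ConvertSrcIdxToDstIdx(int64_t src_idx,
                                  const std::vector<int64_t> &src_strides,
                                  const std::vector<int64_t> &dst_strides,
                                  int rank) {
      int64_t dst_idx = 0;
      for (int k = 0; k < rank; ++k) {
        int64_t coord = src_idx / src_strides[k + 1];
        src_idx -= coord * src_strides[k + 1];
        // Equal adjacent strides mean this dimension has size 1 in the
        // input, so its coordinate does not advance the input offset.
        if (dst_strides[k] != dst_strides[k + 1]) {
          dst_idx += coord * dst_strides[k + 1];
        }
      }
      return dst_idx;
    }

    int main() {
      // Output shape [2, 3]; one input broadcast from shape [1, 3].
      auto out_strides = ShapeToStride({2, 3});  // {6, 3, 1}
      auto in_strides = ShapeToStride({1, 3});   // {3, 3, 1}
      // Output index 5 is coordinate (1, 2); it must read input (0, 2).
      int64_t in_idx = ConvertSrcIdxToDstIdx(5, out_strides, in_strides, 2);
      assert(in_idx == 2);
      std::printf("out index 5 -> in index %lld\n",
                  static_cast<long long>(in_idx));
      return 0;
    }

The per-input need_broadcast flags computed in the helper let the kernel skip this conversion entirely and issue a vectorized phi::Load whenever an input's expanded strides already match the output's.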