From 55c154360fb8527cb36044a16fb69e067141a0e7 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Mon, 5 Sep 2022 06:47:53 +0000 Subject: [PATCH 01/12] support int64 non-broadcast --- paddle/phi/kernels/funcs/broadcast_function.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/paddle/phi/kernels/funcs/broadcast_function.h b/paddle/phi/kernels/funcs/broadcast_function.h index 9b9d9e1d20e12..40e2e39f024ee 100644 --- a/paddle/phi/kernels/funcs/broadcast_function.h +++ b/paddle/phi/kernels/funcs/broadcast_function.h @@ -580,10 +580,24 @@ void BroadcastKernel(const KPDevice &ctx, Functor func) { std::vector dims_size; dims_size.reserve(ins.size()); + + bool no_broadcast_flag = true; for (auto *in : ins) { + no_broadcast_flag &= (in->dims() == ins[0]->dims()); dims_size.emplace_back(in->dims().size()); } + if (!ins.empty()) { + for (auto *out : *outs) { + no_broadcast_flag &= (out->dims() == ins[0]->dims()); + } + } + + if (no_broadcast_flag) { + phi::funcs::ElementwiseKernel(ctx, ins, outs, func); + return; + } + axis = axis == -1 ? *std::max_element(dims_size.begin(), dims_size.end()) - *std::min_element(dims_size.begin(), dims_size.end()) : axis; From b7b010005f1a4e3d97b7f046b1781e96d3709a8e Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Wed, 7 Sep 2022 01:30:01 +0000 Subject: [PATCH 02/12] support broadcast case for int64 index --- .../platform/device/gpu/cuda/cuda_helper.h | 11 +- .../platform/device/gpu/rocm/rocm_helper.h | 3 +- paddle/phi/backends/gpu/cuda/cuda_helper.h | 11 +- paddle/phi/backends/gpu/rocm/rocm_helper.h | 3 +- paddle/phi/kernels/funcs/broadcast_function.h | 286 ++++++++++++++++++ 5 files changed, 302 insertions(+), 12 deletions(-) diff --git a/paddle/fluid/platform/device/gpu/cuda/cuda_helper.h b/paddle/fluid/platform/device/gpu/cuda/cuda_helper.h index d1d33d50a5dbb..fa21a5f096611 100644 --- a/paddle/fluid/platform/device/gpu/cuda/cuda_helper.h +++ b/paddle/fluid/platform/device/gpu/cuda/cuda_helper.h @@ -70,11 +70,12 @@ namespace platform { * */ -#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \ - int64_t __index__ = \ - static_cast(blockIdx.x) * blockDim.x + threadIdx.x; \ - for (index_type i = __index__; __index__ < (num); \ - __index__ += blockDim.x * gridDim.x, i = __index__) +#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \ + int64_t __index__ = \ + static_cast(blockIdx.x) * blockDim.x + threadIdx.x; \ + int64_t __stride__ = static_cast(blockDim.x) * gridDim.x; \ + for (index_type i = __index__; __index__ < (num); \ + __index__ += __stride__, i = __index__) class CublasHandleHolder { public: diff --git a/paddle/fluid/platform/device/gpu/rocm/rocm_helper.h b/paddle/fluid/platform/device/gpu/rocm/rocm_helper.h index 8bcae15d3517e..45eba2b1537c8 100644 --- a/paddle/fluid/platform/device/gpu/rocm/rocm_helper.h +++ b/paddle/fluid/platform/device/gpu/rocm/rocm_helper.h @@ -70,8 +70,9 @@ namespace platform { #define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \ int64_t __index__ = \ static_cast(hipBlockIdx_x) * hipBlockDim_x + hipThreadIdx_x; \ + int64_t __stride__ = static_cast(hipBlockDim_x) * hipGridDim_x; \ for (index_type i = __index__; __index__ < (num); \ - __index__ += hipBlockDim_x * hipGridDim_x, i = __index__) + __index__ += __stride__, i = __index__) class CublasHandleHolder { public: diff --git a/paddle/phi/backends/gpu/cuda/cuda_helper.h b/paddle/phi/backends/gpu/cuda/cuda_helper.h index 6d33d802b1880..7463edc5d9ff6 100644 --- a/paddle/phi/backends/gpu/cuda/cuda_helper.h +++ b/paddle/phi/backends/gpu/cuda/cuda_helper.h @@ -62,11 +62,12 @@ 
namespace gpu { * */ -#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \ - int64_t __index__ = \ - static_cast(blockIdx.x) * blockDim.x + threadIdx.x; \ - for (index_type i = __index__; __index__ < (num); \ - __index__ += blockDim.x * gridDim.x, i = __index__) +#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \ + int64_t __index__ = \ + static_cast(blockIdx.x) * blockDim.x + threadIdx.x; \ + int64_t __stride__ = static_cast(blockDim.x) * gridDim.x; \ + for (index_type i = __index__; __index__ < (num); \ + __index__ += __stride__, i = __index__) } // namespace gpu } // namespace backends diff --git a/paddle/phi/backends/gpu/rocm/rocm_helper.h b/paddle/phi/backends/gpu/rocm/rocm_helper.h index e25dea28e36c1..07fdde5a2f417 100644 --- a/paddle/phi/backends/gpu/rocm/rocm_helper.h +++ b/paddle/phi/backends/gpu/rocm/rocm_helper.h @@ -65,8 +65,9 @@ namespace gpu { #define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \ int64_t __index__ = \ static_cast(hipBlockIdx_x) * hipBlockDim_x + hipThreadIdx_x; \ + int64_t __stride__ = static_cast(hipBlockDim_x) * hipGridDim_x; \ for (index_type i = __index__; __index__ < (num); \ - __index__ += hipBlockDim_x * hipGridDim_x, i = __index__) + __index__ += __stride__, i = __index__) } // namespace gpu } // namespace backends diff --git a/paddle/phi/kernels/funcs/broadcast_function.h b/paddle/phi/kernels/funcs/broadcast_function.h index 40e2e39f024ee..aa2304e38463a 100644 --- a/paddle/phi/kernels/funcs/broadcast_function.h +++ b/paddle/phi/kernels/funcs/broadcast_function.h @@ -224,6 +224,7 @@ struct DimensionsTransform { }; template + int GetVecsize(const std::vector &ins, std::vector *outs) { int in_vec_size = 4; @@ -468,6 +469,233 @@ void LaunchBroadcastKernel( func); } +#ifndef PADDLE_WITH_XPU_KP +HOSTDEVICE static int64_t ConvertSrcIdxToDstIdx( + int64_t src_idx, + const phi::Array &src_strides, + const phi::Array &dst_strides, + int rank) { + int64_t dst_idx = 0; + int64_t old_src_idx = src_idx; + for (int k = 0; k < rank; ++k) { + auto local_idx = src_idx / src_strides[k + 1]; + src_idx -= local_idx * src_strides[k + 1]; + + if (dst_strides[k] != dst_strides[k + 1]) { + dst_idx += local_idx * dst_strides[k + 1]; + } + } + return dst_idx; +} + +template +HOSTDEVICE static void ReadVecDataWithInt64Index( + const T *in, + int64_t idx, + const phi::Array &src_strides, + const phi::Array &dst_strides, + int rank, + phi::AlignedVector *out) { + if (src_strides[0] == dst_strides[0]) { + phi::Load(in + idx, out); + } else { +#pragma unroll + for (int i = 0; i < VecSize; ++i) { + (*out)[i] = + in[ConvertSrcIdxToDstIdx(idx + i, src_strides, dst_strides, rank)]; + } + } +} + +template +__global__ void BinaryBroadcastKernelWithInt64Index( + const InT *x, + const InT *y, + OutT *z, + phi::Array x_strides, + phi::Array y_strides, + phi::Array z_strides, + int rank, + Functor functor) { + int64_t numel = z_strides[0]; + int64_t idx = + (static_cast(blockIdx.x) * blockDim.x + threadIdx.x) * VecSize; + int64_t stride = static_cast(blockDim.x) * gridDim.x * VecSize; + int64_t limit = numel - VecSize; + + for (; idx <= limit; idx += stride) { + phi::AlignedVector x_vec, y_vec; + phi::AlignedVector z_vec; + ReadVecDataWithInt64Index( + x, idx, z_strides, x_strides, rank, &x_vec); + ReadVecDataWithInt64Index( + y, idx, z_strides, y_strides, rank, &y_vec); +#pragma unroll + for (int i = 0; i < VecSize; ++i) { + z_vec[i] = functor(x_vec[i], y_vec[i]); + } + phi::Store(z_vec, z + idx); + } + + for (; idx < numel; ++idx) { + z[idx] = functor(x[idx], y[idx]); + } +} + +template 
+struct LaunchBroadcastKernelWithInt64IndexHelper { + static void Run(const KPDevice &ctx, + const std::vector &ins, + std::vector *outs, + int axis, + Functor func) { + PADDLE_THROW(phi::errors::PermissionDenied( + "Unreachable code branch. This may be a bug.")); + } +}; + +template +struct LaunchBroadcastKernelWithInt64IndexHelper { + static void Run(const KPDevice &ctx, + const std::vector &ins, + std::vector *outs, + int axis, + Functor func) { + const auto *x = ins[0], *y = ins[1]; + auto *z = (*outs)[0]; + const auto *x_data = x->data(); + const auto *y_data = y->data(); + auto *z_data = ctx.template Alloc(z); + + phi::Array x_out_dims, y_out_dims, + broadcast_out_dims; + int rank; + CalculateBroadcastDims(x->dims(), + y->dims(), + axis, + &x_out_dims, + &y_out_dims, + &broadcast_out_dims, + &rank); + + auto x_strides = ShapeToStride(x_out_dims, rank); + auto y_strides = ShapeToStride(y_out_dims, rank); + auto z_strides = ShapeToStride(broadcast_out_dims, rank); + int64_t numel = z_strides[0]; + auto gpu_config = + phi::backends::gpu::GetGpuLaunchConfig1D(ctx, numel, VecSize); + + BinaryBroadcastKernelWithInt64Index + <<>>(x_data, + y_data, + z_data, + x_strides, + y_strides, + z_strides, + rank, + func); + } + + private: + static void CalculateBroadcastDims( + const phi::DDim &x_dims, + const phi::DDim &y_dims, + int axis, + phi::Array *x_out_dims, + phi::Array *y_out_dims, + phi::Array *broadcast_out_dims, + int *length) { + int nx = x_dims.size(), ny = y_dims.size(); + PADDLE_ENFORCE_GE( + axis, 0, phi::errors::InvalidArgument("Invalid axis value: %d", axis)); + if (nx == ny) { + *length = nx; + for (int i = 0; i < nx; ++i) { + if (x_dims[i] != y_dims[i]) { + PADDLE_ENFORCE_EQ( + x_dims[i] == 1 || y_dims[i] == 1, + true, + phi::errors::InvalidArgument("Cannot broadcast input shape where " + "x_dims[%d] = %d, y_dims[%d] = %d.", + i, + x_dims[i], + i, + y_dims[i])); + } + (*broadcast_out_dims)[i] = std::max(x_dims[i], y_dims[i]); + (*x_out_dims)[i] = x_dims[i]; + (*y_out_dims)[i] = y_dims[i]; + } + } else if (nx > ny) { + *length = nx; + for (int i = nx - axis; i < ny; ++i) { + PADDLE_ENFORCE_EQ( + y_dims[i], + 1, + phi::errors::InvalidArgument( + "The trailing Y.shape[%d] should be 1 but got %d.", + i, + y_dims[i])); + } + + for (int i = 0; i < nx; ++i) { + if (i >= axis && i - axis < ny) { + if (x_dims[i] != y_dims[i - axis]) { + PADDLE_ENFORCE_EQ(x_dims[i] == 1 || y_dims[i - axis] == 1, + true, + phi::errors::InvalidArgument( + "Cannot broadcast input shape where " + "x_dims[%d] = %d, y_dims[%d] = %d.", + i, + x_dims[i], + i - axis, + y_dims[i - axis])); + } + (*broadcast_out_dims)[i] = std::max(x_dims[i], y_dims[i - axis]); + (*x_out_dims)[i] = x_dims[i]; + (*y_out_dims)[i] = y_dims[i - axis]; + } else { + (*broadcast_out_dims)[i] = x_dims[i]; + (*x_out_dims)[i] = x_dims[i]; + (*y_out_dims)[i] = 1; + } + } + } else { + CalculateBroadcastDims(y_dims, + x_dims, + axis, + y_out_dims, + x_out_dims, + broadcast_out_dims, + length); + } + } + + static phi::Array ShapeToStride( + const phi::Array &arr, int rank) { + phi::Array strides; + strides[rank] = 1; + for (int i = rank - 1; i >= 0; --i) { + strides[i] = strides[i + 1] * arr[i]; + } + return strides; + } +}; +#endif + template size(), NumOuts)); +#ifndef PADDLE_WITH_XPU_KP + constexpr bool kEnabledInt64IndexKernel = (NumOuts == 1 && kArity == 2); + bool use_int64_index_kernel = + kEnabledInt64IndexKernel && + (*outs)[0]->numel() >= std::numeric_limits::max(); + use_int64_index_kernel = kEnabledInt64IndexKernel; + if 
(use_int64_index_kernel) { + int vec_size = GetVecsize(ins, outs); + switch (vec_size) { + case VecSizeL: { + LaunchBroadcastKernelWithInt64IndexHelper::Run(ctx, + ins, + outs, + axis, + func); + break; + } + case VecSizeM: { + LaunchBroadcastKernelWithInt64IndexHelper::Run(ctx, + ins, + outs, + axis, + func); + break; + } + case VecSizeS: { + LaunchBroadcastKernelWithInt64IndexHelper::Run(ctx, + ins, + outs, + axis, + func); + break; + } + default: { + PADDLE_THROW(phi::errors::Unimplemented( + "Unsupported vectorized size: %d!", vec_size)); + break; + } + } + return; + } +#endif + // mergedim and get vec_size const auto merge_dims = DimensionsTransform(ins, (*outs)[0]->dims(), axis); phi::Array configs; From c86679abba3bbceb93df65e25efb26553aea7fe7 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Wed, 7 Sep 2022 05:01:33 +0000 Subject: [PATCH 03/12] fix bug --- paddle/phi/kernels/funcs/broadcast_function.h | 22 ++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/paddle/phi/kernels/funcs/broadcast_function.h b/paddle/phi/kernels/funcs/broadcast_function.h index aa2304e38463a..40c10100615c7 100644 --- a/paddle/phi/kernels/funcs/broadcast_function.h +++ b/paddle/phi/kernels/funcs/broadcast_function.h @@ -492,11 +492,12 @@ template HOSTDEVICE static void ReadVecDataWithInt64Index( const T *in, int64_t idx, + bool no_broadcast, const phi::Array &src_strides, const phi::Array &dst_strides, int rank, phi::AlignedVector *out) { - if (src_strides[0] == dst_strides[0]) { + if (no_broadcast) { phi::Load(in + idx, out); } else { #pragma unroll @@ -516,6 +517,8 @@ __global__ void BinaryBroadcastKernelWithInt64Index( phi::Array y_strides, phi::Array z_strides, int rank, + bool x_no_broadcast, + bool y_no_broadcast, Functor functor) { int64_t numel = z_strides[0]; int64_t idx = @@ -527,9 +530,9 @@ __global__ void BinaryBroadcastKernelWithInt64Index( phi::AlignedVector x_vec, y_vec; phi::AlignedVector z_vec; ReadVecDataWithInt64Index( - x, idx, z_strides, x_strides, rank, &x_vec); + x, idx, x_no_broadcast, z_strides, x_strides, rank, &x_vec); ReadVecDataWithInt64Index( - y, idx, z_strides, y_strides, rank, &y_vec); + y, idx, y_no_broadcast, z_strides, y_strides, rank, &y_vec); #pragma unroll for (int i = 0; i < VecSize; ++i) { z_vec[i] = functor(x_vec[i], y_vec[i]); @@ -587,6 +590,8 @@ struct LaunchBroadcastKernelWithInt64IndexHelper &x, + const phi::Array &y, + int rank) { + for (int i = 0; i < rank; ++i) { + if (x[i] != y[i]) return false; + } + return true; + } + static phi::Array ShapeToStride( const phi::Array &arr, int rank) { phi::Array strides; From 727229d8f3208c9380bf2922fff06f934bd14fa7 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Thu, 8 Sep 2022 05:55:06 +0000 Subject: [PATCH 04/12] support more Arity --- paddle/phi/kernels/funcs/broadcast_function.h | 316 +++++++++++++----- 1 file changed, 228 insertions(+), 88 deletions(-) diff --git a/paddle/phi/kernels/funcs/broadcast_function.h b/paddle/phi/kernels/funcs/broadcast_function.h index 8245d6596686c..f575915eba929 100644 --- a/paddle/phi/kernels/funcs/broadcast_function.h +++ b/paddle/phi/kernels/funcs/broadcast_function.h @@ -487,62 +487,161 @@ HOSTDEVICE static int64_t ConvertSrcIdxToDstIdx( return dst_idx; } -template +template HOSTDEVICE static void ReadVecDataWithInt64Index( const T *in, int64_t idx, - bool no_broadcast, + bool need_broadcast, const phi::Array &src_strides, const phi::Array &dst_strides, int rank, + int n, phi::AlignedVector *out) { - if (no_broadcast) { - phi::Load(in + idx, out); - } else { 
-#pragma unroll - for (int i = 0; i < VecSize; ++i) { + if (IsBoundary) { + for (int i = 0; i < n; ++i) { (*out)[i] = in[ConvertSrcIdxToDstIdx(idx + i, src_strides, dst_strides, rank)]; } + } else { + if (!need_broadcast) { + phi::Load(in + idx, out); + } else { +#pragma unroll + for (int i = 0; i < VecSize; ++i) { + (*out)[i] = + in[ConvertSrcIdxToDstIdx(idx + i, src_strides, dst_strides, rank)]; + } + } } } +template +struct ApplyFunctorWithInt64IndexHelper { + HOSTDEVICE static OutT Run(const phi::AlignedVector *ins_vec, + Functor functor, + int i); +}; + template -__global__ void BinaryBroadcastKernelWithInt64Index( - const InT *x, - const InT *y, - OutT *z, - phi::Array x_strides, - phi::Array y_strides, - phi::Array z_strides, +struct ApplyFunctorWithInt64IndexHelper { + HOSTDEVICE static OutT Run(const phi::AlignedVector *ins_vec, + Functor functor, + int i) { + return static_cast(functor()); + } +}; + +template +struct ApplyFunctorWithInt64IndexHelper { + HOSTDEVICE static OutT Run(const phi::AlignedVector *ins_vec, + Functor functor, + int i) { + return static_cast(functor(ins_vec[0][i])); + } +}; + +template +struct ApplyFunctorWithInt64IndexHelper { + HOSTDEVICE static OutT Run(const phi::AlignedVector *ins_vec, + Functor functor, + int i) { + return static_cast(functor(ins_vec[0][i], ins_vec[1][i])); + } +}; + +template +struct ApplyFunctorWithInt64IndexHelper { + HOSTDEVICE static OutT Run(const phi::AlignedVector *ins_vec, + Functor functor, + int i) { + return static_cast( + functor(ins_vec[0][i], ins_vec[1][i], ins_vec[2][i])); + } +}; + +template +struct MaxWithOne { + static constexpr auto kValue = (N >= 1 ? N : 1); +}; + +template +__global__ void BroadcastKernelWithInt64Index( + phi::Array::kValue> ins, + OutT *out, + phi::Array, + MaxWithOne::kValue> ins_strides, + phi::Array out_strides, + phi::Array::kValue> need_broadcasts, int rank, - bool x_no_broadcast, - bool y_no_broadcast, Functor functor) { - int64_t numel = z_strides[0]; + int64_t numel = out_strides[0]; int64_t idx = (static_cast(blockIdx.x) * blockDim.x + threadIdx.x) * VecSize; int64_t stride = static_cast(blockDim.x) * gridDim.x * VecSize; int64_t limit = numel - VecSize; + phi::Array, MaxWithOne::kValue> + ins_vec; + phi::AlignedVector out_vec; for (; idx <= limit; idx += stride) { - phi::AlignedVector x_vec, y_vec; - phi::AlignedVector z_vec; - ReadVecDataWithInt64Index( - x, idx, x_no_broadcast, z_strides, x_strides, rank, &x_vec); - ReadVecDataWithInt64Index( - y, idx, y_no_broadcast, z_strides, y_strides, rank, &y_vec); +#pragma unroll + for (int i = 0; i < NumIns; ++i) { + ReadVecDataWithInt64Index(ins[i], + idx, + need_broadcasts[i], + out_strides, + ins_strides[i], + rank, + VecSize, + &ins_vec[i]); + } + #pragma unroll for (int i = 0; i < VecSize; ++i) { - z_vec[i] = functor(x_vec[i], y_vec[i]); + out_vec[i] = ApplyFunctorWithInt64IndexHelper::Run(ins_vec.Get(), + functor, + i); } - phi::Store(z_vec, z + idx); + + phi::Store(out_vec, out + idx); } - for (; idx < numel; ++idx) { - int64_t x_idx = ConvertSrcIdxToDstIdx(idx, z_strides, x_strides, rank); - int64_t y_idx = ConvertSrcIdxToDstIdx(idx, z_strides, y_strides, rank); - z[idx] = functor(x[x_idx], y[y_idx]); + if (idx < numel) { + int remain = numel - idx; // remain is always less than VecSize, therefore + // `int` is enough here +#pragma unroll + for (int i = 0; i < NumIns; ++i) { + ReadVecDataWithInt64Index(ins[i], + idx, + need_broadcasts[i], + out_strides, + ins_strides[i], + rank, + remain, + &ins_vec[i]); + } + + for (int i = 0; i < 
remain; ++i) { + out[idx] = ApplyFunctorWithInt64IndexHelper::Run(ins_vec.Get(), + functor, + i); + } } } @@ -557,76 +656,117 @@ struct LaunchBroadcastKernelWithInt64IndexHelper { const std::vector &ins, std::vector *outs, int axis, - Functor func) { + Functor functor) { PADDLE_THROW(phi::errors::PermissionDenied( "Unreachable code branch. This may be a bug.")); } }; -template +template struct LaunchBroadcastKernelWithInt64IndexHelper { static void Run(const KPDevice &ctx, const std::vector &ins, std::vector *outs, int axis, - Functor func) { - const auto *x = ins[0], *y = ins[1]; - auto *z = (*outs)[0]; - const auto *x_data = x->data(); - const auto *y_data = y->data(); - auto *z_data = ctx.template Alloc(z); - - phi::Array x_out_dims, y_out_dims, - broadcast_out_dims; + Functor functor) { + phi::Array::kValue> ins_ptrs; + for (int i = 0; i < Arity; ++i) { + ins_ptrs[i] = ins[i]->data(); + } + auto *out_tensor = (*outs)[0]; + auto *out_ptr = ctx.Alloc(out_tensor); + + phi::Array, phi::DDim::kMaxRank> + ins_expand_dims; + phi::Array broadcast_out_dims; int rank; - CalculateBroadcastDims(x->dims(), - y->dims(), - axis, - &x_out_dims, - &y_out_dims, - &broadcast_out_dims, - &rank); - bool x_no_broadcast = IsSame(x_out_dims, broadcast_out_dims, rank); - bool y_no_broadcast = IsSame(y_out_dims, broadcast_out_dims, rank); - - auto x_strides = ShapeToStride(x_out_dims, rank); - auto y_strides = ShapeToStride(y_out_dims, rank); - auto z_strides = ShapeToStride(broadcast_out_dims, rank); - int64_t numel = z_strides[0]; + if (Arity == 1) { + rank = ins[0]->dims().size(); + for (int i = 0; i < rank; ++i) { + broadcast_out_dims[i] = ins[0]->dims()[i]; + } + ins_expand_dims[0] = broadcast_out_dims; + } else if (Arity >= 2) { + phi::Array + ins_expand_dims[MaxWithOne::kValue]; + CalculateBroadcastDims(ins[0]->dims().Get(), + ins[1]->dims().Get(), + ins[0]->dims().size(), + ins[1]->dims().size(), + axis, + ins_expand_dims[0].GetMutable(), + ins_expand_dims[1].GetMutable(), + broadcast_out_dims.GetMutable(), + &rank); + for (int i = 2; i < Arity; ++i) { + auto tmp_dims = broadcast_out_dims; + phi::Array tmp_expand_dims; + int tmp_rank; + PADDLE_ENFORCE_GE(rank, + ins[i]->dims().size(), + phi::errors::InvalidArgument( + "Unsupported reverse broadcast when the input " + "tensor number is larger than 2.")); + CalculateBroadcastDims(tmp_dims.Get(), + ins[i]->dims().Get(), + rank, + ins[i]->dims().size(), + axis, + tmp_expand_dims.GetMutable(), + ins_expand_dims[i].GetMutable(), + broadcast_out_dims.GetMutable(), + &tmp_rank); + PADDLE_ENFORCE_EQ(rank, + tmp_rank, + phi::errors::InvalidArgument( + "Wrong broadcast algorithm. 
This may be a bug.")); + } + } + + phi::Array, + MaxWithOne::kValue> + ins_strides; + phi::Array::kValue> need_broadcasts; + + auto out_strides = ShapeToStride(broadcast_out_dims.Get(), rank); + for (int i = 0; i < Arity; ++i) { + ins_strides[i] = ShapeToStride(ins_expand_dims[i].Get(), rank); + need_broadcasts[i] = + !IsSameShape(out_strides.Get(), ins_strides[i].Get(), rank + 1); + } + + int64_t numel = out_strides[0]; auto gpu_config = phi::backends::gpu::GetGpuLaunchConfig1D(ctx, numel, VecSize); - BinaryBroadcastKernelWithInt64Index + BroadcastKernelWithInt64Index <<>>(x_data, - y_data, - z_data, - x_strides, - y_strides, - z_strides, + ctx.stream()>>>(ins_ptrs, + out_ptr, + ins_strides, + out_strides, + need_broadcasts, rank, - x_no_broadcast, - y_no_broadcast, - func); + functor); } private: - static void CalculateBroadcastDims( - const phi::DDim &x_dims, - const phi::DDim &y_dims, - int axis, - phi::Array *x_out_dims, - phi::Array *y_out_dims, - phi::Array *broadcast_out_dims, - int *length) { - int nx = x_dims.size(), ny = y_dims.size(); + static void CalculateBroadcastDims(const int64_t *x_dims, + const int64_t *y_dims, + int nx, + int ny, + int axis, + int64_t *x_out_dims, + int64_t *y_out_dims, + int64_t *broadcast_out_dims, + int *length) { PADDLE_ENFORCE_GE( axis, 0, phi::errors::InvalidArgument("Invalid axis value: %d", axis)); if (nx == ny) { @@ -643,9 +783,9 @@ struct LaunchBroadcastKernelWithInt64IndexHelper ny) { *length = nx; @@ -672,18 +812,20 @@ struct LaunchBroadcastKernelWithInt64IndexHelper &x, - const phi::Array &y, - int rank) { + static bool IsSameShape(const int64_t *x, const int64_t *y, int rank) { for (int i = 0; i < rank; ++i) { if (x[i] != y[i]) return false; } @@ -702,7 +842,7 @@ struct LaunchBroadcastKernelWithInt64IndexHelper ShapeToStride( - const phi::Array &arr, int rank) { + const int64_t *arr, int rank) { phi::Array strides; strides[rank] = 1; for (int i = rank - 1; i >= 0; --i) { @@ -755,7 +895,7 @@ void BroadcastKernelForDifferentVecSize( NumOuts)); #ifndef PADDLE_WITH_XPU_KP - constexpr bool kEnabledInt64IndexKernel = (NumOuts == 1 && kArity == 2); + constexpr bool kEnabledInt64IndexKernel = (NumOuts == 1 && kArity <= 3); bool use_int64_index_kernel = kEnabledInt64IndexKernel && (*outs)[0]->numel() >= std::numeric_limits::max(); From c081861917045b0ddea880aa04fe0796b9144aae Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Thu, 8 Sep 2022 06:13:36 +0000 Subject: [PATCH 05/12] remove some codes --- paddle/phi/kernels/funcs/broadcast_function.h | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/paddle/phi/kernels/funcs/broadcast_function.h b/paddle/phi/kernels/funcs/broadcast_function.h index f575915eba929..683806c1e5f47 100644 --- a/paddle/phi/kernels/funcs/broadcast_function.h +++ b/paddle/phi/kernels/funcs/broadcast_function.h @@ -1023,24 +1023,10 @@ void BroadcastKernel(const KPDevice &ctx, Functor func) { std::vector dims_size; dims_size.reserve(ins.size()); - - bool no_broadcast_flag = true; for (auto *in : ins) { - no_broadcast_flag &= (in->dims() == ins[0]->dims()); dims_size.emplace_back(in->dims().size()); } - if (!ins.empty()) { - for (auto *out : *outs) { - no_broadcast_flag &= (out->dims() == ins[0]->dims()); - } - } - - if (no_broadcast_flag) { - phi::funcs::ElementwiseKernel(ctx, ins, outs, func); - return; - } - axis = axis == -1 ? 
*std::max_element(dims_size.begin(), dims_size.end()) - *std::min_element(dims_size.begin(), dims_size.end()) : axis; From 4535d32422186f6d289a78e3eae5882ca499657f Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Thu, 8 Sep 2022 07:09:37 +0000 Subject: [PATCH 06/12] upgrade patchelf to v0.15.0 to pass CI build --- tools/dockerfile/Dockerfile.release16 | 13 ++++---- tools/dockerfile/Dockerfile.release18 | 12 +++----- tools/dockerfile/Dockerfile.ubuntu | 14 ++++----- tools/dockerfile/Dockerfile.ubuntu18 | 12 +++----- tools/dockerfile/build_scripts/build.sh | 2 +- .../build_scripts/install_patchelf.sh | 30 +++++++++++++++++++ 6 files changed, 51 insertions(+), 32 deletions(-) create mode 100644 tools/dockerfile/build_scripts/install_patchelf.sh diff --git a/tools/dockerfile/Dockerfile.release16 b/tools/dockerfile/Dockerfile.release16 index 66974f46d91e4..49637a3fee9a4 100644 --- a/tools/dockerfile/Dockerfile.release16 +++ b/tools/dockerfile/Dockerfile.release16 @@ -101,6 +101,12 @@ RUN curl -s -q https://glide.sh/get | sh # Downgrade TensorRT COPY tools/dockerfile/build_scripts /build_scripts RUN bash /build_scripts/install_nccl2.sh + +# Older versions of patchelf limited the size of the files being processed and were fixed in this pr. +# # https://github.com/NixOS/patchelf/commit/ba2695a8110abbc8cc6baf0eea819922ee5007fa +# # So install a newer version here. +RUN bash /build_scripts/install_patchelf.sh + RUN rm -rf /build_scripts # git credential to skip password typing @@ -143,13 +149,6 @@ RUN wget -q https://launchpad.net/ubuntu/+archive/primary/+sourcefiles/binutils/ RUN apt-get install libprotobuf-dev -y -# Older versions of patchelf limited the size of the files being processed and were fixed in this pr. -# https://github.com/NixOS/patchelf/commit/ba2695a8110abbc8cc6baf0eea819922ee5007fa -# So install a newer version here. -RUN wget -q https://paddle-ci.cdn.bcebos.com/patchelf_0.10-2_amd64.deb && \ - dpkg -i patchelf_0.10-2_amd64.deb && \ - rm -rf patchelf_0.10-2_amd64.deb - # Configure OpenSSH server. c.f. https://docs.docker.com/engine/examples/running_ssh_service RUN mkdir /var/run/sshd && echo 'root:root' | chpasswd && sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config && sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config CMD source ~/.bashrc diff --git a/tools/dockerfile/Dockerfile.release18 b/tools/dockerfile/Dockerfile.release18 index d646f41b00d0b..cf343873d943a 100644 --- a/tools/dockerfile/Dockerfile.release18 +++ b/tools/dockerfile/Dockerfile.release18 @@ -28,6 +28,10 @@ RUN apt-get update && \ # Downgrade gcc&&g++ WORKDIR /usr/bin COPY tools/dockerfile/build_scripts /build_scripts +# Older versions of patchelf limited the size of the files being processed and were fixed in this pr. +# # https://github.com/NixOS/patchelf/commit/ba2695a8110abbc8cc6baf0eea819922ee5007fa +# # So install a newer version here. +RUN bash /build_scripts/install_patchelf.sh RUN bash /build_scripts/install_gcc.sh gcc82 && rm -rf /build_scripts RUN cp gcc gcc.bak && cp g++ g++.bak && rm gcc && rm g++ RUN ln -s /usr/local/gcc-8.2/bin/gcc /usr/local/bin/gcc @@ -99,14 +103,6 @@ RUN pip3.7 --no-cache-dir install pylint pytest astroid isort COPY ./python/requirements.txt /root/ RUN pip3.7 --no-cache-dir install -r /root/requirements.txt - -# Older versions of patchelf limited the size of the files being processed and were fixed in this pr. -# https://github.com/NixOS/patchelf/commit/ba2695a8110abbc8cc6baf0eea819922ee5007fa -# So install a newer version here. 
-RUN wget -q https://paddle-ci.cdn.bcebos.com/patchelf_0.10-2_amd64.deb && \ - dpkg -i patchelf_0.10-2_amd64.deb && \ - rm -rf patchelf_0.10-2_amd64.deb - # Configure OpenSSH server. c.f. https://docs.docker.com/engine/examples/running_ssh_service #RUN mkdir /var/run/sshd && echo 'root:root' | chpasswd && sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config && sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config #CMD source ~/.bashrc diff --git a/tools/dockerfile/Dockerfile.ubuntu b/tools/dockerfile/Dockerfile.ubuntu index 7e0c3a62b1d50..7c9636d57ddf5 100644 --- a/tools/dockerfile/Dockerfile.ubuntu +++ b/tools/dockerfile/Dockerfile.ubuntu @@ -143,9 +143,14 @@ RUN curl -s -q https://glide.sh/get | sh # See https://github.com/PaddlePaddle/Paddle/issues/10129 for details. # Downgrade TensorRT + +# Older versions of patchelf limited the size of the files being processed and were fixed in this pr. +# # https://github.com/NixOS/patchelf/commit/ba2695a8110abbc8cc6baf0eea819922ee5007fa +# # So install a newer version here. COPY tools/dockerfile/build_scripts /build_scripts RUN bash /build_scripts/install_trt.sh && \ - bash /build_scripts/install_nccl2.sh + bash /build_scripts/install_nccl2.sh && \ + bash /build_scripts/install_patchelf.sh RUN rm -rf /build_scripts # git credential to skip password typing @@ -236,13 +241,6 @@ RUN wget -q https://launchpad.net/ubuntu/+archive/primary/+sourcefiles/binutils/ RUN apt-get install libprotobuf-dev -y -# Older versions of patchelf limited the size of the files being processed and were fixed in this pr. -# https://github.com/NixOS/patchelf/commit/ba2695a8110abbc8cc6baf0eea819922ee5007fa -# So install a newer version here. -RUN wget -q https://paddle-ci.cdn.bcebos.com/patchelf_0.10-2_amd64.deb && \ - dpkg -i patchelf_0.10-2_amd64.deb && \ - rm -rf patchelf_0.10-2_amd64.deb - # Configure OpenSSH server. c.f. https://docs.docker.com/engine/examples/running_ssh_service RUN mkdir /var/run/sshd && echo 'root:root' | chpasswd && sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config && sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config CMD source ~/.bashrc diff --git a/tools/dockerfile/Dockerfile.ubuntu18 b/tools/dockerfile/Dockerfile.ubuntu18 index a5dba053b98b2..8ebfd9b8371c2 100644 --- a/tools/dockerfile/Dockerfile.ubuntu18 +++ b/tools/dockerfile/Dockerfile.ubuntu18 @@ -35,6 +35,10 @@ RUN apt-get update --allow-unauthenticated && \ WORKDIR /usr/bin COPY tools/dockerfile/build_scripts /build_scripts RUN bash /build_scripts/install_trt.sh +# Older versions of patchelf limited the size of the files being processed and were fixed in this pr. +# # https://github.com/NixOS/patchelf/commit/ba2695a8110abbc8cc6baf0eea819922ee5007fa +# # So install a newer version here. +RUN bash /build_scripts/install_patchelf.sh RUN bash /build_scripts/install_gcc.sh gcc82 && rm -rf /build_scripts RUN cp gcc gcc.bak && cp g++ g++.bak && rm gcc && rm g++ RUN ln -s /usr/local/gcc-8.2/bin/gcc /usr/local/bin/gcc @@ -151,14 +155,6 @@ RUN pip3.6 --no-cache-dir install -r /root/requirements.txt && \ pip3.8 --no-cache-dir install -r /root/requirements.txt && \ pip3.9 --no-cache-dir install -r /root/requirements.txt - -# Older versions of patchelf limited the size of the files being processed and were fixed in this pr. -# https://github.com/NixOS/patchelf/commit/ba2695a8110abbc8cc6baf0eea819922ee5007fa -# So install a newer version here. 
-RUN wget -q https://paddle-ci.cdn.bcebos.com/patchelf_0.10-2_amd64.deb && \ - dpkg -i patchelf_0.10-2_amd64.deb && \ - rm -rf patchelf_0.10-2_amd64.deb - # Configure OpenSSH server. c.f. https://docs.docker.com/engine/examples/running_ssh_service #RUN mkdir /var/run/sshd && echo 'root:root' | chpasswd && sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config && sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config #CMD source ~/.bashrc diff --git a/tools/dockerfile/build_scripts/build.sh b/tools/dockerfile/build_scripts/build.sh index 92d1c12d2bc41..61bcc1f103563 100644 --- a/tools/dockerfile/build_scripts/build.sh +++ b/tools/dockerfile/build_scripts/build.sh @@ -106,7 +106,7 @@ export SSL_CERT_FILE=/opt/_internal/certs.pem # tar -xzf patchelf-0.9njs2.tar.gz # (cd patchelf-0.9njs2 && ./configure && make && make install) # rm -rf patchelf-0.9njs2.tar.gz patchelf-0.9njs2 -yum install -y patchelf +sh "$MY_DIR/install_patchelf.sh" # Install latest pypi release of auditwheel #LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname ${PY35_BIN})/lib" $PY35_BIN/pip install auditwheel diff --git a/tools/dockerfile/build_scripts/install_patchelf.sh b/tools/dockerfile/build_scripts/install_patchelf.sh new file mode 100644 index 0000000000000..4de9c6352f2ea --- /dev/null +++ b/tools/dockerfile/build_scripts/install_patchelf.sh @@ -0,0 +1,30 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -e + +TMP_DIR=patchelf_tmp + +rm -rf "$TMP_DIR" +git clone -b 0.15.0 https://github.com/NixOS/patchelf "$TMP_DIR" + +cd "$TMP_DIR" +./bootstrap.sh +./configure +make +make check +make install + +cd .. 
+rm -rf "$TMP_DIR" From d0d3981b711ee842f402f6b438f54f797d38f11e Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Thu, 8 Sep 2022 08:09:57 +0000 Subject: [PATCH 07/12] fix bug --- paddle/phi/kernels/funcs/broadcast_function.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/paddle/phi/kernels/funcs/broadcast_function.h b/paddle/phi/kernels/funcs/broadcast_function.h index 683806c1e5f47..624be323cd57c 100644 --- a/paddle/phi/kernels/funcs/broadcast_function.h +++ b/paddle/phi/kernels/funcs/broadcast_function.h @@ -681,7 +681,8 @@ struct LaunchBroadcastKernelWithInt64IndexHelper(out_tensor); - phi::Array, phi::DDim::kMaxRank> + phi::Array, + MaxWithOne::kValue> ins_expand_dims; phi::Array broadcast_out_dims; int rank; @@ -692,8 +693,6 @@ struct LaunchBroadcastKernelWithInt64IndexHelper= 2) { - phi::Array - ins_expand_dims[MaxWithOne::kValue]; CalculateBroadcastDims(ins[0]->dims().Get(), ins[1]->dims().Get(), ins[0]->dims().size(), From 914642d67d28d4afb5a9fe53fe4dc5f0887abd79 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Fri, 9 Sep 2022 01:47:19 +0000 Subject: [PATCH 08/12] fix patchelf installation --- tools/dockerfile/build_scripts/install_patchelf.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/dockerfile/build_scripts/install_patchelf.sh b/tools/dockerfile/build_scripts/install_patchelf.sh index 4de9c6352f2ea..9fda46e5b6f86 100644 --- a/tools/dockerfile/build_scripts/install_patchelf.sh +++ b/tools/dockerfile/build_scripts/install_patchelf.sh @@ -23,7 +23,6 @@ cd "$TMP_DIR" ./bootstrap.sh ./configure make -make check make install cd .. From 1f018cbc595de6578b7763bd9db6c79a1029c9a6 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Tue, 13 Sep 2022 07:54:09 +0000 Subject: [PATCH 09/12] add debug flags --- paddle/fluid/platform/flags.cc | 2 + paddle/phi/kernels/funcs/broadcast_function.h | 71 ++++++++++++++++--- 2 files changed, 64 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc index 28dddc1fbebdd..2f2eae6add537 100644 --- a/paddle/fluid/platform/flags.cc +++ b/paddle/fluid/platform/flags.cc @@ -1023,3 +1023,5 @@ PADDLE_DEFINE_EXPORTED_bool( PADDLE_DEFINE_EXPORTED_string(jit_engine_type, "PE", "Choose default funciton type in JitLayer."); + +PADDLE_DEFINE_EXPORTED_bool(use_int32_kernel, false, ""); diff --git a/paddle/phi/kernels/funcs/broadcast_function.h b/paddle/phi/kernels/funcs/broadcast_function.h index 624be323cd57c..eb4a332887e9c 100644 --- a/paddle/phi/kernels/funcs/broadcast_function.h +++ b/paddle/phi/kernels/funcs/broadcast_function.h @@ -14,6 +14,7 @@ limitations under the License. 
*/ #pragma once +#include "gflags/gflags.h" #include "paddle/phi/kernels/funcs/elementwise_base.h" #if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) @@ -22,6 +23,8 @@ namespace kps = phi::kps; #endif +DECLARE_bool(use_int32_kernel); + namespace phi { namespace funcs { @@ -469,6 +472,14 @@ void LaunchBroadcastKernel( } #ifndef PADDLE_WITH_XPU_KP +#define CUDA_ASSERT(__cond) \ + do { \ + if (!(__cond)) { \ + printf(#__cond); \ + asm("trap;"); \ + } \ + } while (0) + HOSTDEVICE static int64_t ConvertSrcIdxToDstIdx( int64_t src_idx, const phi::Array &src_strides, @@ -484,6 +495,8 @@ HOSTDEVICE static int64_t ConvertSrcIdxToDstIdx( dst_idx += local_idx * dst_strides[k + 1]; } } + CUDA_ASSERT(src_idx >= 0 && src_idx < src_strides[0]); + CUDA_ASSERT(dst_idx >= 0 && dst_idx < dst_strides[0]); return dst_idx; } @@ -504,6 +517,16 @@ HOSTDEVICE static void ReadVecDataWithInt64Index( } } else { if (!need_broadcast) { +#pragma unroll + for (int i = 0; i < VecSize; ++i) { + CUDA_ASSERT(idx + i >= 0 && idx + i < src_strides[0]); + } + for (int i = 0; i < rank; ++i) { + CUDA_ASSERT(src_strides[i] == dst_strides[i]); + } + CUDA_ASSERT(src_strides[rank] == 1); + CUDA_ASSERT(dst_strides[rank] == 1); + phi::Load(in + idx, out); } else { #pragma unroll @@ -634,13 +657,14 @@ __global__ void BroadcastKernelWithInt64Index( } for (int i = 0; i < remain; ++i) { - out[idx] = ApplyFunctorWithInt64IndexHelper::Run(ins_vec.Get(), - functor, - i); + out[idx + i] = + ApplyFunctorWithInt64IndexHelper::Run(ins_vec.Get(), + functor, + i); } } } @@ -732,7 +756,12 @@ struct LaunchBroadcastKernelWithInt64IndexHelper::kValue> need_broadcasts; - auto out_strides = ShapeToStride(broadcast_out_dims.Get(), rank); + PADDLE_ENFORCE_EQ( + rank, + out_tensor->dims().size(), + phi::errors::InvalidArgument( + "Output tensor's rank does not match. This may be a bug.")); + auto out_strides = ShapeToStride(out_tensor->dims().Get(), rank); for (int i = 0; i < Arity; ++i) { ins_strides[i] = ShapeToStride(ins_expand_dims[i].Get(), rank); need_broadcasts[i] = @@ -743,6 +772,28 @@ struct LaunchBroadcastKernelWithInt64IndexHelper <<numel() >= std::numeric_limits::max(); - use_int64_index_kernel = kEnabledInt64IndexKernel; + if (!FLAGS_use_int32_kernel) { + use_int64_index_kernel = kEnabledInt64IndexKernel; + } if (use_int64_index_kernel) { int vec_size = GetVecsize(ins, outs); switch (vec_size) { From 0358f919ca8eb2145116667e922b8e00923b68a3 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Tue, 13 Sep 2022 07:55:03 +0000 Subject: [PATCH 10/12] remove useless codes --- paddle/fluid/platform/flags.cc | 2 - paddle/phi/kernels/funcs/broadcast_function.h | 48 +------------------ 2 files changed, 1 insertion(+), 49 deletions(-) diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc index 2f2eae6add537..28dddc1fbebdd 100644 --- a/paddle/fluid/platform/flags.cc +++ b/paddle/fluid/platform/flags.cc @@ -1023,5 +1023,3 @@ PADDLE_DEFINE_EXPORTED_bool( PADDLE_DEFINE_EXPORTED_string(jit_engine_type, "PE", "Choose default funciton type in JitLayer."); - -PADDLE_DEFINE_EXPORTED_bool(use_int32_kernel, false, ""); diff --git a/paddle/phi/kernels/funcs/broadcast_function.h b/paddle/phi/kernels/funcs/broadcast_function.h index eb4a332887e9c..46ec31227346b 100644 --- a/paddle/phi/kernels/funcs/broadcast_function.h +++ b/paddle/phi/kernels/funcs/broadcast_function.h @@ -14,7 +14,6 @@ limitations under the License. 
*/ #pragma once -#include "gflags/gflags.h" #include "paddle/phi/kernels/funcs/elementwise_base.h" #if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) @@ -23,8 +22,6 @@ namespace kps = phi::kps; #endif -DECLARE_bool(use_int32_kernel); - namespace phi { namespace funcs { @@ -472,14 +469,6 @@ void LaunchBroadcastKernel( } #ifndef PADDLE_WITH_XPU_KP -#define CUDA_ASSERT(__cond) \ - do { \ - if (!(__cond)) { \ - printf(#__cond); \ - asm("trap;"); \ - } \ - } while (0) - HOSTDEVICE static int64_t ConvertSrcIdxToDstIdx( int64_t src_idx, const phi::Array &src_strides, @@ -495,8 +484,6 @@ HOSTDEVICE static int64_t ConvertSrcIdxToDstIdx( dst_idx += local_idx * dst_strides[k + 1]; } } - CUDA_ASSERT(src_idx >= 0 && src_idx < src_strides[0]); - CUDA_ASSERT(dst_idx >= 0 && dst_idx < dst_strides[0]); return dst_idx; } @@ -518,15 +505,6 @@ HOSTDEVICE static void ReadVecDataWithInt64Index( } else { if (!need_broadcast) { #pragma unroll - for (int i = 0; i < VecSize; ++i) { - CUDA_ASSERT(idx + i >= 0 && idx + i < src_strides[0]); - } - for (int i = 0; i < rank; ++i) { - CUDA_ASSERT(src_strides[i] == dst_strides[i]); - } - CUDA_ASSERT(src_strides[rank] == 1); - CUDA_ASSERT(dst_strides[rank] == 1); - phi::Load(in + idx, out); } else { #pragma unroll @@ -772,28 +750,6 @@ struct LaunchBroadcastKernelWithInt64IndexHelper <<numel() >= std::numeric_limits::max(); - if (!FLAGS_use_int32_kernel) { - use_int64_index_kernel = kEnabledInt64IndexKernel; - } + use_int64_index_kernel = kEnabledInt64IndexKernel; if (use_int64_index_kernel) { int vec_size = GetVecsize(ins, outs); switch (vec_size) { From 86c83b42b501d9e9111e62799340c989524b9be5 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Tue, 13 Sep 2022 14:52:00 +0000 Subject: [PATCH 11/12] fix viterbi_decode and set_value op uts --- paddle/phi/kernels/funcs/broadcast_function.h | 14 +++++++------- paddle/phi/kernels/gpu/viterbi_decode_kernel.cu | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/paddle/phi/kernels/funcs/broadcast_function.h b/paddle/phi/kernels/funcs/broadcast_function.h index 46ec31227346b..f29ed4f02aed1 100644 --- a/paddle/phi/kernels/funcs/broadcast_function.h +++ b/paddle/phi/kernels/funcs/broadcast_function.h @@ -504,7 +504,6 @@ HOSTDEVICE static void ReadVecDataWithInt64Index( } } else { if (!need_broadcast) { -#pragma unroll phi::Load(in + idx, out); } else { #pragma unroll @@ -733,13 +732,14 @@ struct LaunchBroadcastKernelWithInt64IndexHelper::kValue> ins_strides; phi::Array::kValue> need_broadcasts; + phi::Array out_strides; + const auto &out_dims = out_tensor->dims(); + if (rank <= out_dims.size()) { + out_strides = ShapeToStride(out_dims.Get(), rank); + } else { + out_strides = ShapeToStride(broadcast_out_dims.Get(), rank); + } - PADDLE_ENFORCE_EQ( - rank, - out_tensor->dims().size(), - phi::errors::InvalidArgument( - "Output tensor's rank does not match. 
This may be a bug."));
-    auto out_strides = ShapeToStride(out_tensor->dims().Get(), rank);
     for (int i = 0; i < Arity; ++i) {
       ins_strides[i] = ShapeToStride(ins_expand_dims[i].Get(), rank);
       need_broadcasts[i] =
diff --git a/paddle/phi/kernels/gpu/viterbi_decode_kernel.cu b/paddle/phi/kernels/gpu/viterbi_decode_kernel.cu
index 224651326d762..31227e59433ea 100644
--- a/paddle/phi/kernels/gpu/viterbi_decode_kernel.cu
+++ b/paddle/phi/kernels/gpu/viterbi_decode_kernel.cu
@@ -92,7 +92,7 @@ struct BinaryOperation {
     std::vector<DenseTensor *> outs{output};
     paddle::operators::
         LaunchElementwiseCudaKernel(
-            dev_ctx, ins, &outs, -1, BinaryFunctor());
+            dev_ctx, ins, &outs, 0, BinaryFunctor());
   }
 };

From 5cf185430077682fdd66842db7403c181e825a6e Mon Sep 17 00:00:00 2001
From: sneaxiy
Date: Wed, 14 Sep 2022 01:54:35 +0000
Subject: [PATCH 12/12] remove always enable int64

---
 paddle/phi/kernels/funcs/broadcast_function.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/paddle/phi/kernels/funcs/broadcast_function.h b/paddle/phi/kernels/funcs/broadcast_function.h
index f29ed4f02aed1..7d9efa46b7a5d 100644
--- a/paddle/phi/kernels/funcs/broadcast_function.h
+++ b/paddle/phi/kernels/funcs/broadcast_function.h
@@ -905,7 +905,6 @@ void BroadcastKernelForDifferentVecSize(
   bool use_int64_index_kernel =
       kEnabledInt64IndexKernel &&
       (*outs)[0]->numel() >= std::numeric_limits<int32_t>::max();
-  use_int64_index_kernel = kEnabledInt64IndexKernel;
   if (use_int64_index_kernel) {
     int vec_size = GetVecsize<OutT>(ins, outs);
     switch (vec_size) {
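
A note on the CUDA_KERNEL_LOOP_TYPE change in patch 02: blockDim.x and gridDim.x are 32-bit unsigned values, so the old increment `__index__ += blockDim.x * gridDim.x` performed the multiplication in unsigned 32-bit arithmetic; the product (up to 1024 threads times 2^31 - 1 blocks) can exceed 32 bits and is truncated before it is added to the 64-bit index. Hoisting the stride into an explicit int64_t widens the multiplication itself, and the same fix is applied to both the fluid and phi copies of cuda_helper.h and rocm_helper.h. A minimal sketch of the resulting grid-stride pattern (FillOnes and its signature are illustrative only, not Paddle code):

    #include <cstdint>

    // The start index and the stride are both widened to int64_t *before*
    // any multiplication, matching the patched CUDA_KERNEL_LOOP_TYPE
    // expansion.
    __global__ void FillOnes(float *out, int64_t num) {
      int64_t idx = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
      int64_t stride = static_cast<int64_t>(blockDim.x) * gridDim.x;
      for (; idx < num; idx += stride) {
        // With a 32-bit stride this loop could revisit or skip elements
        // once blockDim.x * gridDim.x wraps around.
        out[idx] = 1.0f;
      }
    }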
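
A note on the int64-index broadcast path added in patches 02-04: it is selected in BroadcastKernelForDifferentVecSize once the output numel reaches std::numeric_limits<int32_t>::max(), and its core is ConvertSrcIdxToDstIdx, which walks the output strides to recover per-dimension coordinates and accumulates the input offset, skipping any dimension whose adjacent strides are equal (a size-1, i.e. broadcast, dimension). Below is a host-side sketch of that mapping, with simplified types (std::vector in place of phi::Array) and a made-up [2, 3] versus [1, 3] example; it illustrates the indexing scheme and is not the kernel itself:

    #include <cassert>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // Row-major strides with one extra leading slot, as in the patch's
    // ShapeToStride: strides[0] ends up holding the total element count.
    std::vector<int64_t> ShapeToStride(const std::vector<int64_t> &dims) {
      int rank = static_cast<int>(dims.size());
      std::vector<int64_t> strides(rank + 1);
      strides[rank] = 1;
      for (int i = rank - 1; i >= 0; --i) strides[i] = strides[i + 1] * dims[i];
      return strides;
    }

    // Decompose the output (src) linear index into coordinates and
    // re-compose the input (dst) offset, dropping broadcast dimensions.
    int64_t ConvertSrcIdxToDstIdx(int64_t src_idx,
                                  const std::vector<int64_t> &src_strides,
                                  const std::vector<int64_t> &dst_strides,
                                  int rank) {
      int64_t dst_idx = 0;
      for (int k = 0; k < rank; ++k) {
        int64_t coord = src_idx / src_strides[k + 1];
        src_idx -= coord * src_strides[k + 1];
        // Equal adjacent strides mean this dimension has size 1 in the
        // input, so its coordinate does not advance the input offset.
        if (dst_strides[k] != dst_strides[k + 1]) {
          dst_idx += coord * dst_strides[k + 1];
        }
      }
      return dst_idx;
    }

    int main() {
      // Output shape [2, 3]; one input broadcast from shape [1, 3].
      auto out_strides = ShapeToStride({2, 3});  // {6, 3, 1}
      auto in_strides = ShapeToStride({1, 3});   // {3, 3, 1}
      // Output index 5 is coordinate (1, 2); it must read input (0, 2).
      int64_t in_idx = ConvertSrcIdxToDstIdx(5, out_strides, in_strides, 2);
      assert(in_idx == 2);
      std::printf("out index 5 -> in index %lld\n",
                  static_cast<long long>(in_idx));
      return 0;
    }

The per-input need_broadcast flags computed in the helper let the kernel skip this conversion entirely and issue a vectorized phi::Load whenever an input's expanded strides already match the output's.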