[Cherry-pick][Release/2.4] support pure bfloat16 for more ops #47177

Merged 1 commit on Oct 20, 2022
13 changes: 11 additions & 2 deletions paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu
@@ -17,6 +17,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/framework/scope_guard.h"
#include "paddle/fluid/platform/bfloat16.h"
#include "paddle/fluid/platform/dynload/cublasLt.h"
#include "paddle/fluid/platform/float16.h"

@@ -62,6 +63,9 @@ class FusedGemmEpilogueKernel : public framework::OpKernel<T> {
if (std::is_same<T, paddle::platform::float16>::value) {
mat_type = CUDA_R_16F;
}
if (std::is_same<T, platform::bfloat16>::value) {
mat_type = CUDA_R_16BF;
}
if (std::is_same<T, double>::value) {
mat_type = CUDA_R_64F;
scale_type = CUDA_R_64F;
@@ -352,6 +356,9 @@ class FusedGemmEpilogueGradKernel : public framework::OpKernel<T> {
if (std::is_same<T, paddle::platform::float16>::value) {
mat_type = CUDA_R_16F;
}
if (std::is_same<T, platform::bfloat16>::value) {
mat_type = CUDA_R_16BF;
}
if (std::is_same<T, double>::value) {
mat_type = CUDA_R_64F;
scale_type = CUDA_R_64F;
@@ -686,12 +693,14 @@ REGISTER_OP_CUDA_KERNEL(
fused_gemm_epilogue,
ops::FusedGemmEpilogueKernel<phi::GPUContext, float>,
ops::FusedGemmEpilogueKernel<phi::GPUContext, double>,
ops::FusedGemmEpilogueKernel<phi::GPUContext, paddle::platform::float16>);
ops::FusedGemmEpilogueKernel<phi::GPUContext, paddle::platform::float16>,
ops::FusedGemmEpilogueKernel<phi::GPUContext, paddle::platform::bfloat16>);

REGISTER_OP_CUDA_KERNEL(
fused_gemm_epilogue_grad,
ops::FusedGemmEpilogueGradKernel<phi::GPUContext, float>,
ops::FusedGemmEpilogueGradKernel<phi::GPUContext, double>,
ops::FusedGemmEpilogueGradKernel<phi::GPUContext,
paddle::platform::float16>);
paddle::platform::float16>,
ops::FusedGemmEpilogueGradKernel<phi::GPUContext, paddle::platform::bfloat16>);
#endif
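The two kernels above follow the file's existing pattern of translating the element type T into a cuBLASLt data-type enum before the matmul descriptor is built; the new branch simply adds CUDA_R_16BF for bfloat16. A minimal standalone sketch of that mapping, assuming a CUDA 11+ toolkit where CUDA_R_16BF and __nv_bfloat16 are available (the helper name is illustrative, not Paddle's):

#include <library_types.h>  // cudaDataType_t and the CUDA_R_* enums
#include <cuda_fp16.h>
#include <cuda_bf16.h>
#include <type_traits>

// Pick the cuBLASLt matrix data type from the element type.
template <typename T>
cudaDataType_t ToCudaDataType() {
  if (std::is_same<T, __half>::value) return CUDA_R_16F;
  if (std::is_same<T, __nv_bfloat16>::value) return CUDA_R_16BF;
  if (std::is_same<T, double>::value) return CUDA_R_64F;
  return CUDA_R_32F;  // float (and anything unhandled) stays in FP32
}

Note that in the hunks above only the double branch also overrides scale_type, so the FP16/BF16 GEMMs keep the default FP32 scale type.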
111 changes: 56 additions & 55 deletions paddle/fluid/platform/device/gpu/gpu_primitives.h
@@ -198,61 +198,6 @@ __device__ __forceinline__ void fastAtomicAdd(T *arr,
T value) {
CudaAtomicAdd(arr + index, value);
}

#ifdef PADDLE_WITH_CUDA
/*
* One thread block handles element-wise atomicAdd for a vector of length len.
* @in: [x1, x2, x3, ...]
* @out:[y1+x1, y2+x2, y3+x3, ...]
* */
template <typename T,
typename std::enable_if<
!std::is_same<platform::float16, T>::value>::type * = nullptr>
__device__ __forceinline__ void VectorizedAtomicAddPerBlock(
const int64_t len, int tid, int threads_per_block, const T *in, T *out) {
for (int i = tid; i < len; i += threads_per_block) {
CudaAtomicAdd(&out[i], in[i]);
}
}

// Note: assume that len is even. If len is odd, call fastAtomicAdd directly.
template <typename T,
typename std::enable_if<
std::is_same<platform::float16, T>::value>::type * = nullptr>
__device__ __forceinline__ void VectorizedAtomicAddPerBlock(
const int64_t len, int tid, int threads_per_block, const T *in, T *out) {
#if ((CUDA_VERSION < 10000) || \
(defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700)))
for (int i = tid; i < len; i += threads_per_block) {
CudaAtomicAdd(&out[i], in[i]);
}
#else
int i = 0;
int loops = len / 2 * 2;

bool aligned_half2 =
(reinterpret_cast<std::uintptr_t>(out) % sizeof(__half2) == 0);

if (aligned_half2) {
for (i = tid * 2; i < loops; i += threads_per_block * 2) {
__half2 value2;
T value_1 = in[i];
T value_2 = in[i + 1];
value2.x = *reinterpret_cast<__half *>(&value_1);
value2.y = *reinterpret_cast<__half *>(&value_2);
atomicAdd(reinterpret_cast<__half2 *>(&out[i]), value2);
}
for (; i < len; i += threads_per_block) {
fastAtomicAdd(out, i, len, in[i]);
}
} else {
for (int i = tid; i < len; i += threads_per_block) {
fastAtomicAdd(out, i, len, in[i]);
}
}
#endif
}
#endif
#endif

// NOTE(zhangbo): cuda do not have atomicCAS for __nv_bfloat16.
@@ -601,5 +546,61 @@ CUDA_ATOMIC_WRAPPER(Min, float16) {
}
#endif

#ifdef PADDLE_CUDA_FP16
#ifdef PADDLE_WITH_CUDA
/*
* One thread block handles element-wise atomicAdd for a vector of length len.
* @in: [x1, x2, x3, ...]
* @out:[y1+x1, y2+x2, y3+x3, ...]
* */
template <typename T,
typename std::enable_if<
!std::is_same<platform::float16, T>::value>::type * = nullptr>
__device__ __forceinline__ void VectorizedAtomicAddPerBlock(
const int64_t len, int tid, int threads_per_block, const T *in, T *out) {
for (int i = tid; i < len; i += threads_per_block) {
CudaAtomicAdd(&out[i], in[i]);
}
}

// Note: assume that len is even. If len is odd, call fastAtomicAdd directly.
template <typename T,
typename std::enable_if<
std::is_same<platform::float16, T>::value>::type * = nullptr>
__device__ __forceinline__ void VectorizedAtomicAddPerBlock(
const int64_t len, int tid, int threads_per_block, const T *in, T *out) {
#if ((CUDA_VERSION < 10000) || \
(defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700)))
for (int i = tid; i < len; i += threads_per_block) {
CudaAtomicAdd(&out[i], in[i]);
}
#else
int i = 0;
int loops = len / 2 * 2;

bool aligned_half2 =
(reinterpret_cast<std::uintptr_t>(out) % sizeof(__half2) == 0);

if (aligned_half2) {
for (i = tid * 2; i < loops; i += threads_per_block * 2) {
__half2 value2;
T value_1 = in[i];
T value_2 = in[i + 1];
value2.x = *reinterpret_cast<__half *>(&value_1);
value2.y = *reinterpret_cast<__half *>(&value_2);
atomicAdd(reinterpret_cast<__half2 *>(&out[i]), value2);
}
for (; i < len; i += threads_per_block) {
fastAtomicAdd(out, i, len, in[i]);
}
} else {
for (int i = tid; i < len; i += threads_per_block) {
fastAtomicAdd(out, i, len, in[i]);
}
}
#endif
}
#endif
#endif
} // namespace platform
} // namespace paddle
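The diff above relocates VectorizedAtomicAddPerBlock to the end of the header, presumably so that it appears after the specialized CudaAtomicAdd overloads defined there; the helper's body is unchanged. A hedged usage sketch (hypothetical kernel, not from the PR), assuming a CUDA build with PADDLE_CUDA_FP16 enabled and an even row length, since the float16 overload requires even len:

#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"

namespace plat = paddle::platform;

// Each block owns one input row of `src`; all blocks atomically accumulate
// their row into the single shared output row `dst`.
template <typename T>
__global__ void AccumulateRowsKernel(const T *src, T *dst, int64_t row_len) {
  const T *row = src + blockIdx.x * row_len;
  plat::VectorizedAtomicAddPerBlock(row_len,
                                    threadIdx.x,  // tid within the block
                                    blockDim.x,   // threads_per_block
                                    row,
                                    dst);
}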
1 change: 1 addition & 0 deletions paddle/phi/kernels/empty_kernel.cc
@@ -88,6 +88,7 @@ PD_REGISTER_KERNEL(empty,
int64_t,
bool,
phi::dtype::float16,
phi::dtype::bfloat16,
phi::dtype::complex<float>,
phi::dtype::complex<double>) {}

8 changes: 5 additions & 3 deletions paddle/phi/kernels/funcs/activation_functor.h
@@ -2169,12 +2169,14 @@ struct CudaSeluFunctor : public BaseActivationFunctor<T> {
}

__device__ __forceinline__ T operator()(const T x) const {
T res = x;
if (res <= zero) {
using MT =
typename std::conditional<(sizeof(T) > sizeof(float)), T, float>::type;
MT res = static_cast<MT>(x);
if (x <= zero) {
res = alpha * expf(res) - alpha;
}
res *= scale;
return res;
return static_cast<T>(res);
}

private:
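The CudaSeluFunctor change above widens the working value so that expf never operates directly on a 16-bit type: for float16/bfloat16 the math runs in float and only the final result is narrowed back to T. A self-contained sketch of that promotion pattern (illustrative free function, not the functor itself):

#include <type_traits>

// Compute in float for 16-bit (and float) inputs, keep double as-is, then
// narrow the result back to the storage type T.
template <typename T>
__device__ __forceinline__ T SeluLike(T x, float scale, float alpha) {
  using MT =
      typename std::conditional<(sizeof(T) > sizeof(float)), T, float>::type;
  MT res = static_cast<MT>(x);
  if (res <= static_cast<MT>(0)) {
    res = alpha * expf(res) - alpha;  // expf never sees a 16-bit operand
  }
  res *= scale;
  return static_cast<T>(res);
}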
1 change: 1 addition & 0 deletions paddle/phi/kernels/funcs/eigen/broadcast.cu
@@ -84,6 +84,7 @@ INSTANTIATION(EigenBroadcast, int);
INSTANTIATION(EigenBroadcast, int64_t);
INSTANTIATION(EigenBroadcastGrad, bool);
INSTANTIATION(EigenBroadcastGrad, float);
INSTANTIATION(EigenBroadcastGrad, dtype::bfloat16);
INSTANTIATION(EigenBroadcastGrad, dtype::float16);
INSTANTIATION(EigenBroadcastGrad, double);
INSTANTIATION(EigenBroadcastGrad, dtype::complex<float>);
3 changes: 2 additions & 1 deletion paddle/phi/kernels/gpu/activation_grad_kernel.cu
@@ -449,4 +449,5 @@ PD_REGISTER_KERNEL(pow_grad,
double,
int,
int64_t,
phi::dtype::float16) {}
phi::dtype::float16,
phi::dtype::bfloat16) {}
11 changes: 9 additions & 2 deletions paddle/phi/kernels/gpu/activation_kernel.cu
@@ -265,5 +265,12 @@ PD_REGISTER_KERNEL(pow,
double,
int,
int64_t,
phi::dtype::float16) {}
PD_REGISTER_KERNEL(selu, GPU, ALL_LAYOUT, phi::SeluKernel, float, double) {}
phi::dtype::float16,
phi::dtype::bfloat16) {}
PD_REGISTER_KERNEL(selu,
GPU,
ALL_LAYOUT,
phi::SeluKernel,
float,
double,
phi::dtype::bfloat16) {}
6 changes: 4 additions & 2 deletions paddle/phi/kernels/gpu/adam_kernel.cu
@@ -373,7 +373,8 @@ PD_REGISTER_KERNEL(adam,
phi::AdamDenseKernel,
float,
double,
phi::dtype::float16) {
phi::dtype::float16,
phi::dtype::bfloat16) {
// Skip beta1_pow, beta2_pow, skip_update data transform
kernel->InputAt(5).SetBackend(phi::Backend::ALL_BACKEND);
kernel->InputAt(6).SetBackend(phi::Backend::ALL_BACKEND);
@@ -386,7 +387,8 @@ PD_REGISTER_KERNEL(merged_adam,
phi::MergedAdamKernel,
float,
double,
phi::dtype::float16) {
phi::dtype::float16,
phi::dtype::bfloat16) {
// Skip beta1_pow, beta2_pow data transform
kernel->InputAt(5).SetBackend(phi::Backend::ALL_BACKEND);
kernel->InputAt(6).SetBackend(phi::Backend::ALL_BACKEND);
1 change: 1 addition & 0 deletions paddle/phi/kernels/gpu/clip_grad_kernel.cu
@@ -27,4 +27,5 @@ PD_REGISTER_KERNEL(clip_grad,
double,
int,
int64_t,
phi::dtype::bfloat16,
phi::dtype::float16) {}
3 changes: 2 additions & 1 deletion paddle/phi/kernels/gpu/clip_kernel.cu
@@ -27,4 +27,5 @@ PD_REGISTER_KERNEL(clip,
double,
int,
int64_t,
phi::dtype::float16) {}
phi::dtype::float16,
phi::dtype::bfloat16) {}
6 changes: 4 additions & 2 deletions paddle/phi/kernels/gpu/embedding_grad_kernel.cu
@@ -249,12 +249,14 @@ PD_REGISTER_KERNEL(embedding_grad,
phi::EmbeddingGradKernel,
float,
double,
phi::dtype::float16) {}
phi::dtype::float16,
phi::dtype::bfloat16) {}

PD_REGISTER_KERNEL(embedding_sparse_grad,
GPU,
ALL_LAYOUT,
phi::EmbeddingSparseGradKernel,
float,
double,
phi::dtype::float16) {}
phi::dtype::float16,
phi::dtype::bfloat16) {}
3 changes: 2 additions & 1 deletion paddle/phi/kernels/gpu/embedding_kernel.cu
@@ -125,4 +125,5 @@ PD_REGISTER_KERNEL(embedding,
phi::EmbeddingKernel,
float,
double,
phi::dtype::float16) {}
phi::dtype::float16,
phi::dtype::bfloat16) {}
3 changes: 2 additions & 1 deletion paddle/phi/kernels/gpu/gelu_grad_kernel.cu
@@ -99,4 +99,5 @@ PD_REGISTER_KERNEL(gelu_grad,
phi::GeluGradKernel,
float,
double,
phi::dtype::float16) {}
phi::dtype::float16,
phi::dtype::bfloat16) {}
3 changes: 2 additions & 1 deletion paddle/phi/kernels/gpu/gelu_kernel.cu
@@ -93,4 +93,5 @@ PD_REGISTER_KERNEL(gelu,
phi::GeluKernel,
float,
double,
phi::dtype::float16) {}
phi::dtype::float16,
phi::dtype::bfloat16) {}
3 changes: 2 additions & 1 deletion paddle/phi/kernels/gpu/pad3d_grad_kernel.cu
@@ -509,4 +509,5 @@ PD_REGISTER_KERNEL(pad3d_grad,
phi::Pad3dGradKernel,
float,
double,
phi::dtype::float16) {}
phi::dtype::float16,
phi::dtype::bfloat16) {}
1 change: 1 addition & 0 deletions paddle/phi/kernels/gpu/pad3d_kernel.cu
@@ -583,6 +583,7 @@ PD_REGISTER_KERNEL(pad3d,
ALL_LAYOUT,
phi::Pad3dKernel,
phi::dtype::float16,
phi::dtype::bfloat16,
float,
double,
int,
4 changes: 3 additions & 1 deletion paddle/phi/kernels/gpu/pixel_shuffle_grad_kernel.cu
@@ -23,4 +23,6 @@ PD_REGISTER_KERNEL(pixel_shuffle_grad,
ALL_LAYOUT,
phi::PixelShuffleGradKernel,
float,
double) {}
double,
phi::dtype::float16,
phi::dtype::bfloat16) {}
10 changes: 8 additions & 2 deletions paddle/phi/kernels/gpu/pixel_shuffle_kernel.cu
@@ -18,5 +18,11 @@
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/pixel_shuffle_kernel_impl.h"

PD_REGISTER_KERNEL(
pixel_shuffle, GPU, ALL_LAYOUT, phi::PixelShuffleKernel, float, double) {}
PD_REGISTER_KERNEL(pixel_shuffle,
GPU,
ALL_LAYOUT,
phi::PixelShuffleKernel,
float,
double,
phi::dtype::float16,
phi::dtype::bfloat16) {}
9 changes: 7 additions & 2 deletions paddle/phi/kernels/gpu/selu_grad_kernel.cu
@@ -18,5 +18,10 @@
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/selu_grad_kernel_impl.h"

PD_REGISTER_KERNEL(
selu_grad, GPU, ALL_LAYOUT, phi::SeluGradKernel, float, double) {}
PD_REGISTER_KERNEL(selu_grad,
GPU,
ALL_LAYOUT,
phi::SeluGradKernel,
float,
double,
phi::dtype::bfloat16) {}
3 changes: 2 additions & 1 deletion paddle/phi/kernels/gpu/tile_grad_kernel.cu
@@ -27,4 +27,5 @@ PD_REGISTER_KERNEL(tile_grad,
double,
int,
int64_t,
phi::dtype::float16) {}
phi::dtype::float16,
phi::dtype::bfloat16) {}
6 changes: 4 additions & 2 deletions paddle/phi/kernels/gpu/where_grad_kernel.cu
@@ -25,10 +25,10 @@ __global__ void WhereGradCUDAKernel(
int idx = blockDim.x * blockIdx.x + threadIdx.x;
for (; idx < N; idx += blockDim.x * gridDim.x) {
if (dx != nullptr) {
dx[idx] = cond[idx] ? dout[idx] : 0.;
dx[idx] = cond[idx] ? dout[idx] : static_cast<T>(0.);
}
if (dy != nullptr) {
dy[idx] = cond[idx] ? 0. : dout[idx];
dy[idx] = cond[idx] ? static_cast<T>(0.) : dout[idx];
}
}
}
@@ -61,6 +61,8 @@ PD_REGISTER_KERNEL(where_grad,
GPU,
ALL_LAYOUT,
phi::WhereGradKernel,
phi::dtype::float16,
phi::dtype::bfloat16,
float,
double,
int,
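The static_cast<T>(0.) change above is what lets WhereGradCUDAKernel instantiate for 16-bit types: the bare literal 0. is a double, and with T = float16/bfloat16 the two arms of the conditional operator do not resolve to a single type (the needed conversions are missing or ambiguous), so compilation fails. Casting the literal keeps both arms as T. A reduced standalone sketch of the same rule (hypothetical helper, not the kernel):

// Keep both arms of the ternary the same type so the template compiles for
// float16/bfloat16 as well as float/double/int.
template <typename T>
__device__ __forceinline__ T SelectOrZero(bool keep, T value) {
  return keep ? value : static_cast<T>(0.0);
}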
12 changes: 10 additions & 2 deletions paddle/phi/kernels/gpu/where_kernel.cu
@@ -45,5 +45,13 @@ void WhereKernel(const Context& ctx,

} // namespace phi

PD_REGISTER_KERNEL(
where, GPU, ALL_LAYOUT, phi::WhereKernel, float, double, int, int64_t) {}
PD_REGISTER_KERNEL(where,
GPU,
ALL_LAYOUT,
phi::WhereKernel,
float,
double,
int,
int64_t,
phi::dtype::float16,
phi::dtype::bfloat16) {}
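Most of the remaining hunks in this PR are registration-only changes: adding phi::dtype::bfloat16 to PD_REGISTER_KERNEL instantiates the already dtype-generic templates for the new type. As a reminder of what the format stores, bfloat16 is the upper 16 bits of an IEEE-754 float32 (same 8-bit exponent, 7-bit mantissa). A small host-side illustration in plain C++ with no Paddle headers (truncation shown for brevity; production conversions typically round to nearest even):

#include <cstdint>
#include <cstring>
#include <cstdio>

static uint16_t FloatToBF16Bits(float f) {
  uint32_t u;
  std::memcpy(&u, &f, sizeof(u));
  return static_cast<uint16_t>(u >> 16);  // keep sign, exponent, top 7 mantissa bits
}

static float BF16BitsToFloat(uint16_t b) {
  uint32_t u = static_cast<uint32_t>(b) << 16;
  float f;
  std::memcpy(&f, &u, sizeof(f));
  return f;
}

int main() {
  float x = 3.14159f;
  float y = BF16BitsToFloat(FloatToBF16Bits(x));
  std::printf("%.6f -> %.6f\n", x, y);  // bf16 keeps roughly 2-3 decimal digits
}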