Skip to content

Commit

Permalink
move bernoulli kernel to pten
Browse files Browse the repository at this point in the history
  • Loading branch information
zhiqiu committed Feb 15, 2022
1 parent 3e7825f commit 6b48fdc
Show file tree
Hide file tree
Showing 6 changed files with 206 additions and 112 deletions.
28 changes: 0 additions & 28 deletions paddle/fluid/operators/bernoulli_op.cc
Original file line number Diff line number Diff line change
Expand Up @@ -49,30 +49,6 @@ class BernoulliOp : public framework::OperatorWithKernel {
}
};

// It seems that Eigen::Tensor::random in GPU will SEGFAULT.
// Use std::random and thrust::random(thrust is a std library in CUDA) to
// implement uniform random.
template <typename T>
class BernoulliOpKernel<platform::CPUDeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
const auto x = ctx.Input<framework::Tensor>("X");
auto out = ctx.Output<framework::Tensor>("Out");
auto *in_data = x->data<T>();
auto *out_data = out->mutable_data<T>(ctx.GetPlace());

int64_t size = x->numel();
std::uniform_real_distribution<T> dist(0.0, 1.0);
auto gen_ptr = framework::DefaultCPUGenerator();
auto engine = gen_ptr->GetCPUEngine();

for (int64_t i = 0; i < size; ++i) {
out_data[i] = BernoulliFunctor(in_data[i], dist(*engine));
}
}
}; // namespace operators

} // namespace operators
} // namespace paddle

Expand All @@ -82,7 +58,3 @@ REGISTER_OPERATOR(
bernoulli, ops::BernoulliOp, ops::BernoulliOpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);

REGISTER_OP_CPU_KERNEL(bernoulli,
ops::BernoulliOpKernel<plat::CPUDeviceContext, float>,
ops::BernoulliOpKernel<plat::CPUDeviceContext, double>);
84 changes: 0 additions & 84 deletions paddle/fluid/operators/bernoulli_op.cu

This file was deleted.

47 changes: 47 additions & 0 deletions paddle/fluid/platform/transform.h
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,53 @@ struct Transform<platform::CUDADeviceContext> {
#endif
}
};

template <>
struct Transform<pten::GPUContext> {
template <typename InputIter, typename OutputIter, typename UnaryOperation>
void operator()(const pten::GPUContext& context, InputIter first,
InputIter last, OutputIter result, UnaryOperation op) {
auto place = context.GetPlace();
PADDLE_ENFORCE_EQ(is_gpu_place(place), true,
platform::errors::PreconditionNotMet(
"The CUDA Transform must be used in GPU place."));
#ifdef __HIPCC__
thrust::transform(thrust::hip::par.on(context.stream()),
details::CastToCUDATransformIterator(first),
details::CastToCUDATransformIterator(last),
details::CastToCUDATransformIterator(result), op);
#else
thrust::transform(thrust::cuda::par.on(context.stream()),
details::CastToCUDATransformIterator(first),
details::CastToCUDATransformIterator(last),
details::CastToCUDATransformIterator(result), op);
#endif
}

template <typename InputIter1, typename InputIter2, typename OutputIter,
typename BinaryOperation>
void operator()(const pten::GPUContext& context, InputIter1 first1,
InputIter1 last1, InputIter2 first2, OutputIter result,
BinaryOperation op) {
auto place = context.GetPlace();
PADDLE_ENFORCE_EQ(is_gpu_place(place), true,
platform::errors::PreconditionNotMet(
"The CUDA Transform must be used in GPU place."));
#ifdef __HIPCC__
thrust::transform(thrust::hip::par.on(context.stream()),
details::CastToCUDATransformIterator(first1),
details::CastToCUDATransformIterator(last1),
details::CastToCUDATransformIterator(first2),
details::CastToCUDATransformIterator(result), op);
#else
thrust::transform(thrust::cuda::par.on(context.stream()),
details::CastToCUDATransformIterator(first1),
details::CastToCUDATransformIterator(last1),
details::CastToCUDATransformIterator(first2),
details::CastToCUDATransformIterator(result), op);
#endif
}
};
#endif

} // namespace platform
Expand Down
40 changes: 40 additions & 0 deletions paddle/pten/kernels/bernoulli_kernel.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "paddle/pten/core/dense_tensor.h"
#include "paddle/pten/core/device_context.h"

namespace pten {

template <typename T>
inline HOSTDEVICE T BernoulliFunctor(T p, T rand) {
PADDLE_ENFORCE_LE(p,
1.0,
pten::errors::OutOfRange(
"The probability should be <= 1, but got %f", p));
PADDLE_ENFORCE_GE(p,
0.0,
pten::errors::OutOfRange(
"The probability should be >= 0, but got %f", p));
return static_cast<T>(rand < p);
}

template <typename T, typename Context>
void BernoulliKernel(const Context& ctx,
const DenseTensor& x,
DenseTensor* out);

} // namespace pten
42 changes: 42 additions & 0 deletions paddle/pten/kernels/cpu/bernoulli_kernel.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/pten/kernels/bernoulli_kernel.h"
#include <random>
#include "paddle/pten/backends/cpu/cpu_context.h"
#include "paddle/pten/core/kernel_registry.h"

namespace pten {

template <typename T, typename Context>
void BernoulliKernel(const Context& ctx,
const DenseTensor& x,
DenseTensor* out) {
auto numel = x.numel();
auto* x_data = x.data<T>();
T* out_data = ctx.template Alloc<T>(out);

std::uniform_real_distribution<T> dist(0.0, 1.0);
auto gen_ptr = ctx.GetGenerator();
auto engine = gen_ptr->GetCPUEngine();

for (int64_t i = 0; i < numel; ++i) {
out_data[i] = BernoulliFunctor(x_data[i], dist(*engine));
}
}

} // namespace pten

PT_REGISTER_KERNEL(
bernoulli, CPU, ALL_LAYOUT, pten::BernoulliKernel, float, double) {}
77 changes: 77 additions & 0 deletions paddle/pten/kernels/gpu/bernoulli_kernel.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <thrust/execution_policy.h>
#include <thrust/random.h>
#include <thrust/transform.h>
#include <algorithm>
#include <vector>
#include "paddle/pten/backends/gpu/gpu_context.h"
#include "paddle/pten/core/dense_tensor.h"
#include "paddle/pten/core/kernel_registry.h"
#include "paddle/pten/kernels/bernoulli_kernel.h"

// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/platform/transform.h"

namespace pten {

template <typename T>
struct BernoulliCudaFunctor {
unsigned int seed_;
unsigned int offset_;
__host__ __device__ BernoulliCudaFunctor(unsigned int seed,
unsigned int offset)
: seed_(seed), offset_(offset) {}

__host__ __device__ T operator()(const unsigned int n, const T p) const {
// NOTE(zhiqiu): currently, PADDLE_ENFORCE in cuda kernel may print several
// lines of error messages if, and it should be refined.
PADDLE_ENFORCE(p >= 0.0 && p <= 1.0,
"The probability should be >=0 and <= 1, but got %f",
p);
thrust::minstd_rand rng;
rng.seed(seed_);
thrust::uniform_real_distribution<T> dist(0.0, 1.0);
rng.discard(n + offset_);
return static_cast<T>(dist(rng) < p);
}
};

template <typename T, typename Context>
void BernoulliKernel(const Context& ctx,
const DenseTensor& x,
DenseTensor* out) {
auto numel = x.numel();
auto* x_data = x.data<T>();
T* out_data = ctx.template Alloc<T>(out);

auto gen_cuda = ctx.GetGenerator();
auto seed_offset = gen_cuda->IncrementOffset(1);
int64_t gen_offset = numel * seed_offset.second;
paddle::platform::Transform<pten::GPUContext> trans;
thrust::counting_iterator<int64_t> index_sequence_begin(0);
trans(ctx,
index_sequence_begin,
index_sequence_begin + numel,
x_data,
out_data,
BernoulliCudaFunctor<T>(static_cast<int64_t>(seed_offset.first),
static_cast<int64_t>(gen_offset)));
}

} // namespace pten

PT_REGISTER_KERNEL(
bernoulli, GPU, ALL_LAYOUT, pten::BernoulliKernel, float, double) {}

0 comments on commit 6b48fdc

Please sign in to comment.