move bernoulli kernel to pten

zhiqiu · Feb 15, 2022 · 6b48fdc · 6b48fdc
1 parent 3e7825f
commit 6b48fdc
Show file tree

Hide file tree

Showing 6 changed files with 206 additions and 112 deletions.
diff --git a/paddle/fluid/operators/bernoulli_op.cc b/paddle/fluid/operators/bernoulli_op.cc
@@ -49,30 +49,6 @@ class BernoulliOp : public framework::OperatorWithKernel {
   }
 };
 
-// It seems that Eigen::Tensor::random in GPU will SEGFAULT.
-// Use std::random and thrust::random(thrust is a std library in CUDA) to
-// implement uniform random.
-template <typename T>
-class BernoulliOpKernel<platform::CPUDeviceContext, T>
-    : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    const auto x = ctx.Input<framework::Tensor>("X");
-    auto out = ctx.Output<framework::Tensor>("Out");
-    auto *in_data = x->data<T>();
-    auto *out_data = out->mutable_data<T>(ctx.GetPlace());
-
-    int64_t size = x->numel();
-    std::uniform_real_distribution<T> dist(0.0, 1.0);
-    auto gen_ptr = framework::DefaultCPUGenerator();
-    auto engine = gen_ptr->GetCPUEngine();
-
-    for (int64_t i = 0; i < size; ++i) {
-      out_data[i] = BernoulliFunctor(in_data[i], dist(*engine));
-    }
-  }
-};  // namespace operators
-
 }  // namespace operators
 }  // namespace paddle
 
@@ -82,7 +58,3 @@ REGISTER_OPERATOR(
     bernoulli, ops::BernoulliOp, ops::BernoulliOpMaker,
     paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
     paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
-
-REGISTER_OP_CPU_KERNEL(bernoulli,
-                       ops::BernoulliOpKernel<plat::CPUDeviceContext, float>,
-                       ops::BernoulliOpKernel<plat::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/bernoulli_op.cu b/paddle/fluid/operators/bernoulli_op.cu
diff --git a/paddle/fluid/platform/transform.h b/paddle/fluid/platform/transform.h
@@ -141,6 +141,53 @@ struct Transform<platform::CUDADeviceContext> {
 #endif
   }
 };
+
+template <>
+struct Transform<pten::GPUContext> {
+  template <typename InputIter, typename OutputIter, typename UnaryOperation>
+  void operator()(const pten::GPUContext& context, InputIter first,
+                  InputIter last, OutputIter result, UnaryOperation op) {
+    auto place = context.GetPlace();
+    PADDLE_ENFORCE_EQ(is_gpu_place(place), true,
+                      platform::errors::PreconditionNotMet(
+                          "The CUDA Transform must be used in GPU place."));
+#ifdef __HIPCC__
+    thrust::transform(thrust::hip::par.on(context.stream()),
+                      details::CastToCUDATransformIterator(first),
+                      details::CastToCUDATransformIterator(last),
+                      details::CastToCUDATransformIterator(result), op);
+#else
+    thrust::transform(thrust::cuda::par.on(context.stream()),
+                      details::CastToCUDATransformIterator(first),
+                      details::CastToCUDATransformIterator(last),
+                      details::CastToCUDATransformIterator(result), op);
+#endif
+  }
+
+  template <typename InputIter1, typename InputIter2, typename OutputIter,
+            typename BinaryOperation>
+  void operator()(const pten::GPUContext& context, InputIter1 first1,
+                  InputIter1 last1, InputIter2 first2, OutputIter result,
+                  BinaryOperation op) {
+    auto place = context.GetPlace();
+    PADDLE_ENFORCE_EQ(is_gpu_place(place), true,
+                      platform::errors::PreconditionNotMet(
+                          "The CUDA Transform must be used in GPU place."));
+#ifdef __HIPCC__
+    thrust::transform(thrust::hip::par.on(context.stream()),
+                      details::CastToCUDATransformIterator(first1),
+                      details::CastToCUDATransformIterator(last1),
+                      details::CastToCUDATransformIterator(first2),
+                      details::CastToCUDATransformIterator(result), op);
+#else
+    thrust::transform(thrust::cuda::par.on(context.stream()),
+                      details::CastToCUDATransformIterator(first1),
+                      details::CastToCUDATransformIterator(last1),
+                      details::CastToCUDATransformIterator(first2),
+                      details::CastToCUDATransformIterator(result), op);
+#endif
+  }
+};
 #endif
 
 }  // namespace platform

diff --git a/paddle/pten/kernels/bernoulli_kernel.h b/paddle/pten/kernels/bernoulli_kernel.h
@@ -0,0 +1,40 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/core/device_context.h"
+
+namespace pten {
+
+template <typename T>
+inline HOSTDEVICE T BernoulliFunctor(T p, T rand) {
+  PADDLE_ENFORCE_LE(p,
+                    1.0,
+                    pten::errors::OutOfRange(
+                        "The probability should be <= 1, but got %f", p));
+  PADDLE_ENFORCE_GE(p,
+                    0.0,
+                    pten::errors::OutOfRange(
+                        "The probability should be >= 0, but got %f", p));
+  return static_cast<T>(rand < p);
+}
+
+template <typename T, typename Context>
+void BernoulliKernel(const Context& ctx,
+                     const DenseTensor& x,
+                     DenseTensor* out);
+
+}  // namespace pten
diff --git a/paddle/pten/kernels/cpu/bernoulli_kernel.cc b/paddle/pten/kernels/cpu/bernoulli_kernel.cc
@@ -0,0 +1,42 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/pten/kernels/bernoulli_kernel.h"
+#include <random>
+#include "paddle/pten/backends/cpu/cpu_context.h"
+#include "paddle/pten/core/kernel_registry.h"
+
+namespace pten {
+
+template <typename T, typename Context>
+void BernoulliKernel(const Context& ctx,
+                     const DenseTensor& x,
+                     DenseTensor* out) {
+  auto numel = x.numel();
+  auto* x_data = x.data<T>();
+  T* out_data = ctx.template Alloc<T>(out);
+
+  std::uniform_real_distribution<T> dist(0.0, 1.0);
+  auto gen_ptr = ctx.GetGenerator();
+  auto engine = gen_ptr->GetCPUEngine();
+
+  for (int64_t i = 0; i < numel; ++i) {
+    out_data[i] = BernoulliFunctor(x_data[i], dist(*engine));
+  }
+}
+
+}  // namespace pten
+
+PT_REGISTER_KERNEL(
+    bernoulli, CPU, ALL_LAYOUT, pten::BernoulliKernel, float, double) {}
diff --git a/paddle/pten/kernels/gpu/bernoulli_kernel.cu b/paddle/pten/kernels/gpu/bernoulli_kernel.cu
@@ -0,0 +1,77 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <thrust/execution_policy.h>
+#include <thrust/random.h>
+#include <thrust/transform.h>
+#include <algorithm>
+#include <vector>
+#include "paddle/pten/backends/gpu/gpu_context.h"
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/core/kernel_registry.h"
+#include "paddle/pten/kernels/bernoulli_kernel.h"
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/platform/transform.h"
+
+namespace pten {
+
+template <typename T>
+struct BernoulliCudaFunctor {
+  unsigned int seed_;
+  unsigned int offset_;
+  __host__ __device__ BernoulliCudaFunctor(unsigned int seed,
+                                           unsigned int offset)
+      : seed_(seed), offset_(offset) {}
+
+  __host__ __device__ T operator()(const unsigned int n, const T p) const {
+    // NOTE(zhiqiu): currently, PADDLE_ENFORCE in cuda kernel may print several
+    // lines of error messages if, and it should be refined.
+    PADDLE_ENFORCE(p >= 0.0 && p <= 1.0,
+                   "The probability should be >=0 and <= 1, but got %f",
+                   p);
+    thrust::minstd_rand rng;
+    rng.seed(seed_);
+    thrust::uniform_real_distribution<T> dist(0.0, 1.0);
+    rng.discard(n + offset_);
+    return static_cast<T>(dist(rng) < p);
+  }
+};
+
+template <typename T, typename Context>
+void BernoulliKernel(const Context& ctx,
+                     const DenseTensor& x,
+                     DenseTensor* out) {
+  auto numel = x.numel();
+  auto* x_data = x.data<T>();
+  T* out_data = ctx.template Alloc<T>(out);
+
+  auto gen_cuda = ctx.GetGenerator();
+  auto seed_offset = gen_cuda->IncrementOffset(1);
+  int64_t gen_offset = numel * seed_offset.second;
+  paddle::platform::Transform<pten::GPUContext> trans;
+  thrust::counting_iterator<int64_t> index_sequence_begin(0);
+  trans(ctx,
+        index_sequence_begin,
+        index_sequence_begin + numel,
+        x_data,
+        out_data,
+        BernoulliCudaFunctor<T>(static_cast<int64_t>(seed_offset.first),
+                                static_cast<int64_t>(gen_offset)));
+}
+
+}  // namespace pten
+
+PT_REGISTER_KERNEL(
+    bernoulli, GPU, ALL_LAYOUT, pten::BernoulliKernel, float, double) {}