[PTEN] Add Gpu context (PaddlePaddle#39305)
jiweibo authored Feb 6, 2022
1 parent dcff7fa commit a821c4a
Showing 135 changed files with 4,438 additions and 770 deletions.
2 changes: 1 addition & 1 deletion paddle/fluid/distributed/common/utils.h
@@ -33,7 +33,7 @@ namespace distributed {
template <typename T>
inline paddle::operators::math::BlasT<paddle::platform::CPUDeviceContext, T>
GetBlas() {
- auto cpu_ctx = paddle::platform::CPUDeviceContext();
+ paddle::platform::CPUDeviceContext cpu_ctx;
return paddle::operators::math::GetBlas<paddle::platform::CPUDeviceContext,
T>(cpu_ctx);
}
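Throughout this commit, copy-initialization of device contexts ("auto cpu_ctx = paddle::platform::CPUDeviceContext();") gives way to a plain declaration. A minimal sketch of why, assuming the contexts become non-copyable once they own allocator pointers; the stand-in DeviceContext below is illustrative, not Paddle's real class:

#include <cassert>

class Allocator {};

// Stand-in for a device context that owns allocator wiring; copying is
// deleted so two contexts can never alias the same allocator state.
class DeviceContext {
 public:
  DeviceContext() = default;
  DeviceContext(const DeviceContext&) = delete;
  DeviceContext& operator=(const DeviceContext&) = delete;
  void SetAllocator(Allocator* a) { allocator_ = a; }
  Allocator* allocator() const { return allocator_; }

 private:
  Allocator* allocator_ = nullptr;
};

int main() {
  // Pre-C++17, copy-initialization needs an accessible copy/move ctor even
  // though the copy is elided, so this form stops compiling:
  //   auto ctx = DeviceContext();
  DeviceContext ctx;  // the declaration style this commit switches to
  Allocator alloc;
  ctx.SetAllocator(&alloc);
  assert(ctx.allocator() == &alloc);
  return 0;
}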
@@ -1155,7 +1155,7 @@ void GeoCommunicator::SendDense(const CommContext &send_ctx) {
auto &t_latest = var_latest->Get<framework::LoDTensor>();
auto t_timestamp = var_timestamp->GetMutable<framework::LoDTensor>();

- auto cpu_ctx = paddle::platform::CPUDeviceContext();
+ paddle::platform::CPUDeviceContext cpu_ctx;
auto *var_delta = delta_scope_->Var(varname);
auto *t_delta = var_delta->GetMutable<framework::LoDTensor>();
t_delta->mutable_data<float>(t_latest.dims(), cpu_ctx.GetPlace());
@@ -1185,7 +1185,7 @@ void GeoCommunicator::RecvDense(const CommContext &send_ctx) {
RpcRecvDense(varnames, table_id, pserver_scope_.get());

// 2.1 pserver - old => delta; 2.2 latest + old => latest 2.3 old => pserver
- auto cpu_ctx = paddle::platform::CPUDeviceContext();
+ paddle::platform::CPUDeviceContext cpu_ctx;
for (auto &varname : varnames) {
auto *var_latest = recv_scope_->FindVar(varname);
auto t_latest = var_latest->GetMutable<framework::LoDTensor>();
@@ -1292,7 +1292,7 @@ void GeoCommunicator::SendSparse(const std::string &varname,
auto *t_old = var_old->GetMutable<framework::LoDTensor>();

auto dims1 = t_latest.dims()[1];
- auto cpu_ctx = paddle::platform::CPUDeviceContext();
+ paddle::platform::CPUDeviceContext cpu_ctx;

auto *var_delta = delta_scope_->Var(varname);
auto *t_delta = var_delta->GetMutable<pten::SelectedRows>();
@@ -1370,7 +1370,7 @@ void GeoCommunicator::RecvSparse(const std::string &varname, int table_id,
std::vector<float> v_delta;
v_delta.resize(numel);

- auto cpu_ctx = paddle::platform::CPUDeviceContext();
+ paddle::platform::CPUDeviceContext cpu_ctx;
auto blas =
paddle::operators::math::GetBlas<platform::CPUDeviceContext, float>(
cpu_ctx);
@@ -179,7 +179,7 @@ inline void MergeVars(const std::string &var_name,
}

// set output tensor to 0.
- auto cpu_ctx = paddle::platform::CPUDeviceContext();
+ paddle::platform::CPUDeviceContext cpu_ctx;
paddle::operators::math::SetConstant<paddle::platform::CPUDeviceContext, T>
constant_functor;
constant_functor(cpu_ctx, out_t, static_cast<T>(0));
@@ -204,7 +204,7 @@ inline void MergeVars(const std::string &var_name,
for (auto &var : vars) {
inputs.push_back(&var->Get<pten::SelectedRows>());
}
- auto dev_ctx = paddle::platform::CPUDeviceContext();
+ paddle::platform::CPUDeviceContext dev_ctx;
if (merge_add) {
paddle::operators::math::scatter::MergeAdd<
paddle::platform::CPUDeviceContext, T>
5 changes: 4 additions & 1 deletion paddle/fluid/framework/data_type_transform_test.cu
@@ -21,7 +21,10 @@ TEST(DataTypeTransform, GPUTransform) {
auto cpu_place = paddle::platform::CPUPlace();
auto gpu_place = paddle::platform::CUDAPlace(0);
paddle::platform::CUDADeviceContext context(gpu_place);

+ context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
+ .GetAllocator(gpu_place, context.stream())
+ .get());
+ context.PartialInitWithAllocator();
auto kernel_fp16 = paddle::framework::OpKernelType(
paddle::framework::proto::VarType::FP16, gpu_place,
paddle::framework::DataLayout::kAnyLayout,
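The hunk above shows the pattern this PR threads through every CUDA test and helper: a CUDADeviceContext is no longer fully usable straight after construction; the caller first attaches a stream-specific allocator, then calls PartialInitWithAllocator(). A mock of that two-phase contract, with internals that are guesses rather than Paddle's actual implementation:

#include <cassert>

struct Allocator {};

class MockCudaContext {
 public:
  // Phase 1: the constructor sets up only what needs no device memory.
  MockCudaContext() = default;

  void SetAllocator(Allocator* a) { allocator_ = a; }

  // Phase 2: finish the initialization that requires allocations, e.g.
  // library handles and workspace buffers (a guess at what the real
  // PartialInitWithAllocator does).
  void PartialInitWithAllocator() {
    assert(allocator_ != nullptr && "call SetAllocator() first");
    fully_initialized_ = true;
  }

  bool ready() const { return fully_initialized_; }

 private:
  Allocator* allocator_ = nullptr;
  bool fully_initialized_ = false;
};

int main() {
  Allocator stream_allocator;  // the real code fetches this per (place, stream)
  MockCudaContext ctx;
  ctx.SetAllocator(&stream_allocator);
  ctx.PartialInitWithAllocator();
  assert(ctx.ready());
  return 0;
}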
2 changes: 1 addition & 1 deletion paddle/fluid/framework/parallel_executor.cc
@@ -1361,7 +1361,7 @@ void ParallelExecutor::PrepareNCCLCommunicator(Scope *global_scope) {
auto *dev_ctx = static_cast<platform::XPUDeviceContext *>(
pool.Get(member_->places_[dev_id]));
auto &bkcl_ctx = bkcl_ctxs->at(member_->places_[dev_id]);
- dev_ctx->set_bkcl_context(bkcl_ctx.comm());
+ dev_ctx->SetBkclContext(bkcl_ctx.comm());
}
#else
PADDLE_THROW(
7 changes: 7 additions & 0 deletions paddle/fluid/framework/pten_utils.h
@@ -77,6 +77,13 @@ struct ConvertToPtenContext<platform::CPUDeviceContext> {
using TYPE = pten::CPUContext;
};

+ #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+ template <>
+ struct ConvertToPtenContext<platform::CUDADeviceContext> {
+ using TYPE = pten::GPUContext;
+ };
+ #endif

#ifdef PADDLE_WITH_XPU
template <>
struct ConvertToPtenContext<platform::XPUDeviceContext> {
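The new specialization maps platform::CUDADeviceContext to pten::GPUContext at compile time. A self-contained sketch of how such a trait is typically consumed; the mock types below are stand-ins, and the cast rests on the assumption, suggested by this PR's design, that the fluid context derives from its pten counterpart:

#include <iostream>

namespace pten {
struct CPUContext { const char* name() const { return "pten::CPUContext"; } };
struct GPUContext { const char* name() const { return "pten::GPUContext"; } };
}  // namespace pten

namespace platform {
struct CPUDeviceContext : pten::CPUContext {};
struct CUDADeviceContext : pten::GPUContext {};

template <typename T>
struct ConvertToPtenContext { using TYPE = T; };
template <>
struct ConvertToPtenContext<CPUDeviceContext> { using TYPE = pten::CPUContext; };
template <>
struct ConvertToPtenContext<CUDADeviceContext> { using TYPE = pten::GPUContext; };
}  // namespace platform

template <typename DeviceContext>
void RunWithPtenContext(const DeviceContext& dev_ctx) {
  // Map the fluid context type to its pten counterpart at compile time,
  // then upcast; safe because the fluid type inherits from the pten type.
  using PtenCtx = typename platform::ConvertToPtenContext<DeviceContext>::TYPE;
  const auto& pten_ctx = static_cast<const PtenCtx&>(dev_ctx);
  std::cout << pten_ctx.name() << "\n";
}

int main() {
  platform::CUDADeviceContext gpu_ctx;
  RunWithPtenContext(gpu_ctx);  // prints "pten::GPUContext"
  return 0;
}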
6 changes: 3 additions & 3 deletions paddle/fluid/framework/tensor_util.cc
@@ -1085,7 +1085,7 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
is.seekg(seekg, is.cur);

void* buf;
- auto ctx = platform::CPUDeviceContext();
+ platform::CPUDeviceContext ctx;
size_t size = tensor->numel() * framework::SizeOfType(desc.data_type());
if (platform::is_gpu_place(dev_ctx.GetPlace()) ||
platform::is_xpu_place(dev_ctx.GetPlace()) ||
@@ -1155,7 +1155,7 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
std::copy(desc.dims().begin(), desc.dims().end(), std::back_inserter(dims));
tensor->Resize(framework::make_ddim(dims));
void* buf;
- auto ctx = platform::CPUDeviceContext();
+ platform::CPUDeviceContext ctx;
size_t size = tensor->numel() * framework::SizeOfType(desc.data_type());
if (platform::is_gpu_place(dev_ctx.GetPlace()) ||
platform::is_xpu_place(dev_ctx.GetPlace()) ||
@@ -1432,4 +1432,4 @@ std::ostream& operator<<(std::ostream& os, const pten::DenseTensor& t) {
VLOG(1) << "PrintVar: unrecognized data type:" << t.type();
return os;
}
- }
+ } // namespace pten
20 changes: 20 additions & 0 deletions paddle/fluid/framework/tensor_util_test.cc
@@ -73,6 +73,10 @@ TEST(TensorCopy, Tensor) {
// CPU Tensor to GPU Tensor
auto gpu_place = new platform::CUDAPlace(0);
platform::CUDADeviceContext gpu_ctx(*gpu_place);
+ gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
+ .GetAllocator(*gpu_place, gpu_ctx.stream())
+ .get());
+ gpu_ctx.PartialInitWithAllocator();
TensorCopy(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor);

// GPU Tensor to CPU Tensor
@@ -166,6 +170,10 @@ TEST(TensorFromVector, Tensor) {
gpu_tensor.Resize(paddle::framework::make_ddim({3, 3}));
auto gpu_place = new paddle::platform::CUDAPlace();
paddle::platform::CUDADeviceContext gpu_ctx(*gpu_place);
+ gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
+ .GetAllocator(*gpu_place, gpu_ctx.stream())
+ .get());
+ gpu_ctx.PartialInitWithAllocator();
paddle::framework::TensorFromVector<int>(src_vec, gpu_ctx, &gpu_tensor);
// Copy from GPU to CPU tensor for comparison
paddle::framework::TensorCopy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
@@ -230,6 +238,10 @@ TEST(TensorToVector, Tensor) {
paddle::framework::Tensor gpu_tensor;
paddle::platform::CUDAPlace place;
paddle::platform::CUDADeviceContext gpu_ctx(place);
+ gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
+ .GetAllocator(place, gpu_ctx.stream())
+ .get());
+ gpu_ctx.PartialInitWithAllocator();
paddle::framework::TensorFromVector<int>(src_vec, gpu_ctx, &gpu_tensor);

std::vector<int> dst;
@@ -267,6 +279,10 @@ TEST(TensorToVector, Tensor_bool) {
paddle::framework::Tensor gpu_tensor;
paddle::platform::CUDAPlace place;
paddle::platform::CUDADeviceContext gpu_ctx(place);
+ gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
+ .GetAllocator(place, gpu_ctx.stream())
+ .get());
+ gpu_ctx.PartialInitWithAllocator();
paddle::framework::TensorFromVector<bool>(src_vec, gpu_ctx, &gpu_tensor);

std::vector<bool> dst;
@@ -493,6 +509,10 @@ TEST(Tensor, FromAndToStream) {

auto gpu_place = new platform::CUDAPlace();
platform::CUDADeviceContext gpu_ctx(*gpu_place);
+ gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
+ .GetAllocator(*gpu_place, gpu_ctx.stream())
+ .get());
+ gpu_ctx.PartialInitWithAllocator();

TensorCopy(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor);

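The same SetAllocator/PartialInitWithAllocator wiring recurs verbatim in each test above. A hypothetical helper, not part of this PR, could fold the boilerplate into one call; it uses only the AllocatorFacade and CUDADeviceContext calls already visible in this diff:

#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/platform/device_context.h"

namespace {

// Wire a stream-aware device allocator into the context, then finish its
// two-phase initialization. Mirrors the pattern each test repeats above.
void InitCudaContextForTest(const paddle::platform::CUDAPlace& place,
                            paddle::platform::CUDADeviceContext* ctx) {
  ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                        .GetAllocator(place, ctx->stream())
                        .get());
  ctx->PartialInitWithAllocator();
}

}  // namespace

With that helper, each test body would reduce to two lines: construct the context, then call InitCudaContextForTest(*gpu_place, &gpu_ctx).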
11 changes: 11 additions & 0 deletions paddle/fluid/imperative/gloo_context.cc
@@ -46,6 +46,17 @@ void GLOOParallelContext::Init() {
gloo_wrapper->Init();
device_ = std::unique_ptr<platform::CPUDeviceContext>(
new platform::CPUDeviceContext(platform::CPUPlace()));
+ device_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
+ .GetAllocator(platform::CPUPlace())
+ .get());
+ device_->SetHostAllocator(
+ paddle::memory::allocation::AllocatorFacade::Instance()
+ .GetAllocator(paddle::platform::CPUPlace())
+ .get());
+ device_->SetZeroAllocator(
+ paddle::memory::allocation::AllocatorFacade::Instance()
+ .GetZeroAllocator(platform::CPUPlace())
+ .get());
}

void GLOOParallelContext::InitWithRingID(int ring_id) {
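Note that the context gets three distinct allocators here. Judging only by the setter names and call sites in this diff, they appear to cover regular tensor allocations, host-side staging allocations, and zero-size requests respectively. The sketch below is a guess at why a dedicated zero-size allocator is worth having, not Paddle's implementation:

#include <cassert>
#include <cstddef>

// A zero-size request should not hit the real allocator pool; it just needs
// a well-formed, empty allocation object the caller can hold and release.
struct Allocation {
  void* ptr = nullptr;
  size_t size = 0;
};

struct ZeroAllocator {
  Allocation Allocate(size_t size) {
    assert(size == 0 && "this allocator only serves empty tensors");
    return Allocation{};  // no pool bookkeeping, no device round trip
  }
};

int main() {
  ZeroAllocator zero;
  Allocation a = zero.Allocate(0);
  assert(a.ptr == nullptr && a.size == 0);
  return 0;
}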
4 changes: 4 additions & 0 deletions paddle/fluid/inference/lite/test_engine_lite.cc
@@ -77,6 +77,10 @@ void make_fake_model(std::string* model, std::string* param) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
platform::CUDAPlace place;
platform::CUDADeviceContext ctx(place);
+ ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
+ .GetAllocator(place, ctx.stream())
+ .get());
+ ctx.PartialInitWithAllocator();
#else
platform::CPUPlace place;
platform::CPUDeviceContext ctx(place);
12 changes: 12 additions & 0 deletions paddle/fluid/inference/tensorrt/test_engine.cc
@@ -27,6 +27,18 @@ class TensorRTEngineTest : public ::testing::Test {
protected:
void SetUp() override {
ctx_ = new platform::CUDADeviceContext(platform::CUDAPlace(0));
+ ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
+ .GetAllocator(platform::CUDAPlace(0), ctx_->stream())
+ .get());
+ ctx_->SetHostAllocator(
+ paddle::memory::allocation::AllocatorFacade::Instance()
+ .GetAllocator(paddle::platform::CPUPlace())
+ .get());
+ ctx_->SetZeroAllocator(
+ paddle::memory::allocation::AllocatorFacade::Instance()
+ .GetZeroAllocator(platform::CUDAPlace(0))
+ .get());
+ ctx_->PartialInitWithAllocator();

engine_ = new TensorRTEngine(10, 1 << 10);
engine_->InitNetwork();
5 changes: 5 additions & 0 deletions paddle/fluid/memory/allocation/best_fit_allocator_test.cu
@@ -18,6 +18,7 @@
#include <vector>

#include "gtest/gtest.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/memory/allocation/best_fit_allocator.h"
#include "paddle/fluid/memory/allocation/cuda_allocator.h"
#include "paddle/fluid/memory/allocation/locked_allocator.h"
@@ -44,6 +45,10 @@ TEST(BestFitAllocator, concurrent_cuda) {

platform::CUDAPlace gpu(0);
platform::CUDADeviceContext dev_ctx(gpu);
+ dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
+ .GetAllocator(gpu, dev_ctx.stream())
+ .get());
+ dev_ctx.PartialInitWithAllocator();

auto th_main = [&](std::random_device::result_type seed) {
std::default_random_engine engine(seed);
35 changes: 31 additions & 4 deletions paddle/fluid/memory/malloc_test.cu
@@ -25,6 +25,7 @@
#include <vector>

#include "gtest/gtest.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/platform/device_context.h"

@@ -105,8 +106,21 @@ TEST(Malloc, CUDADeviceContextMultiStream) {
main_stream_alloc_ptr.reset();

for (int i = 0; i < NUM_STREAMS; ++i) {
- dev_ctx.push_back(std::unique_ptr<platform::CUDADeviceContext>(
- new platform::CUDADeviceContext(place)));
+ auto ctx = std::unique_ptr<platform::CUDADeviceContext>(
+ new platform::CUDADeviceContext(place));
+ ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
+ .GetAllocator(place, ctx->stream())
+ .get());
+ ctx->SetHostAllocator(
+ paddle::memory::allocation::AllocatorFacade::Instance()
+ .GetAllocator(paddle::platform::CPUPlace())
+ .get());
+ ctx->SetZeroAllocator(
+ paddle::memory::allocation::AllocatorFacade::Instance()
+ .GetZeroAllocator(place)
+ .get());
+ ctx->PartialInitWithAllocator();
+ dev_ctx.emplace_back(std::move(ctx));
MultiStreamCompute(&data[i], &second_data[i], *dev_ctx[i]);
}

@@ -144,8 +158,21 @@ TEST(Malloc, CUDADeviceContextMultiThreadMultiStream) {
main_stream_alloc_ptr.reset();

for (int i = 0; i < NUM_STREAMS; ++i) {
- dev_ctx.push_back(std::unique_ptr<platform::CUDADeviceContext>(
- new platform::CUDADeviceContext(place)));
+ auto ctx = std::unique_ptr<platform::CUDADeviceContext>(
+ new platform::CUDADeviceContext(place));
+ ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
+ .GetAllocator(place, ctx->stream())
+ .get());
+ ctx->SetHostAllocator(
+ paddle::memory::allocation::AllocatorFacade::Instance()
+ .GetAllocator(paddle::platform::CPUPlace())
+ .get());
+ ctx->SetZeroAllocator(
+ paddle::memory::allocation::AllocatorFacade::Instance()
+ .GetZeroAllocator(place)
+ .get());
+ ctx->PartialInitWithAllocator();
+ dev_ctx.emplace_back(std::move(ctx));
threads.push_back(std::thread(MultiStreamCompute, &data[i], &second_data[i],
std::cref(*dev_ctx[i])));
}
2 changes: 1 addition & 1 deletion paddle/fluid/operators/arg_min_max_op_base.cu.h
@@ -110,7 +110,7 @@ void ComputeFullArg(const platform::CUDADeviceContext& ctx, const Tensor& input,
return block_size;
};

- int64_t max_grid_dimx = ctx.GetCUDAMaxGridDimSize().x;
+ int64_t max_grid_dimx = ctx.GetCUDAMaxGridDimSize()[0];
int64_t height = pre * post;
int64_t width = n;
int64_t grid_size = height < max_grid_dimx ? height : max_grid_dimx;
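These call sites track an API change: GetCUDAMaxGridDimSize() previously exposed CUDA's dim3 (fields .x/.y/.z) and now returns an indexable array type, hence the switch to [0]. A stand-in sketch of the before/after shape; std::array is an assumption, since the diff only shows that operator[] works:

#include <array>
#include <cstdint>

struct Dim3Like { int64_t x = 0, y = 0, z = 0; };  // old-style result
using GridDims = std::array<int64_t, 3>;           // assumed new-style result

// Old call sites read the x field; new ones index element 0.
int64_t MaxGridDimX(const Dim3Like& d) { return d.x; }
int64_t MaxGridDimX(const GridDims& d) { return d[0]; }

int main() {
  GridDims dims{2147483647, 65535, 65535};  // typical CUDA grid limits
  return MaxGridDimX(dims) > 0 ? 0 : 1;
}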
4 changes: 2 additions & 2 deletions paddle/fluid/operators/argsort_op.cu
@@ -131,7 +131,7 @@ void ArgFullSort(const platform::CUDADeviceContext& ctx, const Tensor* input,

int block_size = ComputeBlockSize(num_cols);

- int maxGridDimX = ctx.GetCUDAMaxGridDimSize().x;
+ int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0];
// actually, int num_rows < max_grid_size
int grid_size = num_rows < maxGridDimX ? num_rows : maxGridDimX;
// Init a index array
@@ -212,7 +212,7 @@ void ArgFullAssign(const platform::CUDADeviceContext& ctx, const Tensor* dO,

int block_size = ComputeBlockSize(num_cols);

- int maxGridDimX = ctx.GetCUDAMaxGridDimSize().x;
+ int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0];
// actually, int num_rows < max_grid_size
int grid_size = num_rows < maxGridDimX ? num_rows : maxGridDimX;
FillGrad<<<grid_size, block_size, 0, cu_stream>>>(
4 changes: 2 additions & 2 deletions paddle/fluid/operators/broadcast_tensors_op.cu
@@ -90,8 +90,8 @@ class CUDABroadcastTensorsGradOpKernel : public framework::OpKernel<T> {
// reduce_sum implementation on CUDA
auto stream = context.cuda_device_context().stream();
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
- *input_tensor, output_tensor, kps::IdentityFunctor<T>(),
- reduce_dims_vec, stream);
+ context.cuda_device_context(), *input_tensor, output_tensor,
+ kps::IdentityFunctor<T>(), reduce_dims_vec, stream);
}
}
}
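Every TensorReduceFunctorImpl call site in this commit gains the device context as a new leading argument; the raw stream alone evidently no longer suffices, presumably because the reduction needs the context's allocator for temporary storage (an inference, not something the diff states). A mock of the migrated call shape with stand-in types:

#include <vector>

struct Tensor {};
using gpuStream_t = void*;
struct GPUContext {
  gpuStream_t stream() const { return nullptr; }
};

template <typename Tx, typename Ty, typename ReduceOp, typename TransformOp>
void TensorReduceFunctorImpl(const GPUContext& dev_ctx,  // new first param
                             const Tensor& x, Tensor* y,
                             const TransformOp& transform,
                             const std::vector<int>& reduce_dims,
                             gpuStream_t stream) {
  // A real implementation would size temporary storage via dev_ctx's
  // allocator and launch the reduction kernel on the given stream.
  (void)dev_ctx; (void)x; (void)y; (void)transform;
  (void)reduce_dims; (void)stream;
}

struct AddFunctor {};
template <typename T> struct IdentityFunctor {};

int main() {
  GPUContext ctx;
  Tensor in, out;
  TensorReduceFunctorImpl<float, float, AddFunctor, IdentityFunctor<float>>(
      ctx, in, &out, IdentityFunctor<float>{}, {0}, ctx.stream());
  return 0;
}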
3 changes: 2 additions & 1 deletion paddle/fluid/operators/cholesky_solve_op.cu
@@ -115,7 +115,8 @@ class MatrixReduceSumFunctor<platform::CUDADeviceContext, T> {
}
gpuStream_t stream = ctx.cuda_device_context().stream();
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
- in, out, kps::IdentityFunctor<T>(), out_reduce_dims, stream);
+ ctx.cuda_device_context(), in, out, kps::IdentityFunctor<T>(),
+ out_reduce_dims, stream);
}
};

2 changes: 1 addition & 1 deletion paddle/fluid/operators/clip_by_norm_op.cu
@@ -77,7 +77,7 @@ class ClipByNormKernel<platform::CUDADeviceContext, platform::float16>
{1}, dev_ctx);
TensorReduceFunctorImpl<platform::float16, float, kps::AddFunctor,
kps::SquareFunctor<platform::float16, float>>(
*input, &tmp, kps::SquareFunctor<platform::float16, float>(),
dev_ctx, *input, &tmp, kps::SquareFunctor<platform::float16, float>(),
reduce_dims, dev_ctx.stream());
auto tmp_eigen = EigenVector<float>::Flatten(tmp);
auto x_norm = tmp_eigen.sqrt();
3 changes: 2 additions & 1 deletion paddle/fluid/operators/controlflow/compare_all_op.cu
@@ -65,7 +65,8 @@ class CompareReduceOpKernel
auto stream = context.cuda_device_context().stream();
TensorReduceFunctorImpl<bool, bool, BitwiseAdd,
kps::IdentityFunctor<bool>>(
- tmp, z, kps::IdentityFunctor<bool>(), reduce_dims, stream);
+ context.cuda_device_context(), tmp, z, kps::IdentityFunctor<bool>(),
+ reduce_dims, stream);
}
}
};
8 changes: 8 additions & 0 deletions paddle/fluid/operators/copy_cross_scope_test.cc
@@ -131,12 +131,20 @@ void Compare2(f::Scope* scope, const p::DeviceContext& ctx,
TEST(copy_cross_scope, CUDA_fp32) {
f::Scope scope;
p::CUDADeviceContext ctx(p::CUDAPlace(0));
+ ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
+ .GetAllocator(p::CUDAPlace(0), ctx.stream())
+ .get());
+ ctx.PartialInitWithAllocator();
Compare1<float>(&scope, ctx, "copy_cross_scope");
}

TEST(copy_cross_scope_to_main_scope, CUDA_fp32) {
f::Scope scope;
p::CUDADeviceContext ctx(p::CUDAPlace(0));
+ ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
+ .GetAllocator(p::CUDAPlace(0), ctx.stream())
+ .get());
+ ctx.PartialInitWithAllocator();
Compare2<float>(&scope, ctx, "copy_cross_scope");
}
#elif PADDLE_WITH_ASCEND_CL