From a821c4a941d1fdf698bf6ac632c90a02c103502b Mon Sep 17 00:00:00 2001
From: Wilber
Date: Sun, 6 Feb 2022 11:05:25 +0800
Subject: [PATCH] [PTEN] Add Gpu context (#39305)

---
paddle/fluid/distributed/common/utils.h | 2 +-
.../ps/service/communicator/communicator.cc | 8 +-
.../ps/service/communicator/communicator.h | 4 +-
.../framework/data_type_transform_test.cu | 5 +-
paddle/fluid/framework/parallel_executor.cc | 2 +-
paddle/fluid/framework/pten_utils.h | 7 +
paddle/fluid/framework/tensor_util.cc | 6 +-
paddle/fluid/framework/tensor_util_test.cc | 20 +
paddle/fluid/imperative/gloo_context.cc | 11 +
.../fluid/inference/lite/test_engine_lite.cc | 4 +
.../fluid/inference/tensorrt/test_engine.cc | 12 +
.../allocation/best_fit_allocator_test.cu | 5 +
paddle/fluid/memory/malloc_test.cu | 35 +-
.../fluid/operators/arg_min_max_op_base.cu.h | 2 +-
paddle/fluid/operators/argsort_op.cu | 4 +-
.../fluid/operators/broadcast_tensors_op.cu | 4 +-
paddle/fluid/operators/cholesky_solve_op.cu | 3 +-
paddle/fluid/operators/clip_by_norm_op.cu | 2 +-
.../operators/controlflow/compare_all_op.cu | 3 +-
.../fluid/operators/copy_cross_scope_test.cc | 8 +
.../elementwise/elementwise_mul_op.cu | 5 +-
.../elementwise/elementwise_op_function.h | 3 +-
paddle/fluid/operators/feed_forward_test.cu | 12 +
paddle/fluid/operators/fused/attn_gemm.h | 3 +-
paddle/fluid/operators/gelu_op.cu | 4 +-
.../fluid/operators/graph_khop_sampler_op.cu | 4 +-
paddle/fluid/operators/graph_send_recv_op.cu | 4 +-
paddle/fluid/operators/gumbel_softmax_op.cu | 2 +-
paddle/fluid/operators/index_sample_op.cu | 6 +-
paddle/fluid/operators/interpolate_v2_op.cu | 8 +-
paddle/fluid/operators/kron_op.h | 4 +-
paddle/fluid/operators/kthvalue_op.cu | 2 +-
.../operators/lite/lite_engine_op_test.cc | 4 +
.../operators/margin_cross_entropy_op.cu | 4 +-
.../fluid/operators/math/beam_search_test.cc | 79 ++
paddle/fluid/operators/math/blas_impl.cu.h | 577 +++++++++++
paddle/fluid/operators/math/blas_impl.hip.h | 406 ++++++++
paddle/fluid/operators/math/concat_test.cc | 27 +
.../math/cusparse_conversion_api_test.cc | 10 +
paddle/fluid/operators/math/im2col_test.cc | 162 ++++
paddle/fluid/operators/math/inclusive_scan.h | 2 +-
paddle/fluid/operators/math/math_function.cu | 13 +
.../operators/math/math_function_test.cc | 1 +
.../operators/math/math_function_test.cu | 36 +
paddle/fluid/operators/math/vol2col_test.cc | 121 ++-
paddle/fluid/operators/mean_op.cu | 3 +-
.../fluid/operators/nccl/nccl_op_test.cu.cc | 7 +-
paddle/fluid/operators/p_norm_op.cu | 12 +-
paddle/fluid/operators/pool_op.h | 4 +-
paddle/fluid/operators/prelu_op.cu | 3 +-
.../pscore/send_and_recv_op_gpu_test.cc | 4 +
.../fluid/operators/reduce_ops/reduce_op.cu.h | 6 +-
paddle/fluid/operators/renorm_op.cu | 10 +-
paddle/fluid/operators/reshape_op.cc | 9 +-
paddle/fluid/operators/scatter.cu.h | 3 +-
paddle/fluid/operators/solve_op.h | 3 +-
paddle/fluid/operators/strided_memcpy_test.cc | 9 +
.../tensorrt/tensorrt_engine_op_test.cc | 12 +
paddle/fluid/operators/top_k_function_cuda.h | 2 +-
paddle/fluid/operators/trace_op.cu | 3 +-
paddle/fluid/operators/triangular_solve_op.cu | 3 +-
paddle/fluid/operators/viterbi_decode_op.cu | 2 +-
paddle/fluid/platform/CMakeLists.txt | 1 +
paddle/fluid/platform/bfloat16_test.cu | 4 +
paddle/fluid/platform/collective_helper.cc | 15 +-
.../fluid/platform/device/gpu/CMakeLists.txt | 4 +-
.../platform/device/gpu/cuda/CMakeLists.txt | 1 -
.../platform/device/gpu/cuda/cuda_helper.h | 5 +-
.../device/gpu/cuda/cusparse_helper.h | 6 +-
paddle/fluid/platform/device/gpu/gpu_info.cc | 107 ++-
paddle/fluid/platform/device/gpu/gpu_info.h | 5 +-
.../fluid/platform/device/gpu/nccl_helper.h | 17 +-
.../platform/device/gpu/rocm/CMakeLists.txt | 2 -
paddle/fluid/platform/device_context.cc | 229 ++---
paddle/fluid/platform/device_context.h | 156 ++-
paddle/fluid/platform/device_context_test.cu | 28 +
paddle/fluid/platform/device_event_gpu.cc | 5 +-
paddle/fluid/platform/device_event_test.cc | 5 +-
paddle/fluid/platform/enforce.h | 4 +-
paddle/fluid/platform/event.h | 6 +-
paddle/fluid/platform/float16_test.cu | 4 +
paddle/fluid/platform/for_range.h | 36 +
paddle/fluid/platform/stream/cuda_stream.cc | 16 +-
paddle/fluid/platform/stream/cuda_stream.h | 16 +-
paddle/fluid/platform/transform_test.cu | 9 +
paddle/fluid/pybind/cuda_streams_py.cc | 8 +-
paddle/fluid/pybind/pybind.cc | 46 +-
paddle/pten/backends/CMakeLists.txt | 8 +
paddle/pten/backends/cpu/cpu_context.cc | 68 +-
paddle/pten/backends/cpu/cpu_context.h | 34 +-
paddle/pten/backends/gpu/CMakeLists.txt | 9 +
paddle/pten/backends/gpu/cuda/CMakeLists.txt | 1 +
paddle/pten/backends/gpu/cuda/cuda_helper.h | 72 ++
.../backends}/gpu/cuda/cuda_info.cc | 131 ++-
paddle/pten/backends/gpu/forwards.h | 111 +++
paddle/pten/backends/gpu/gpu_context.cc | 899 ++++++++++++++++++
paddle/pten/backends/gpu/gpu_context.h | 165 +++-
paddle/pten/backends/gpu/gpu_decls.h | 70 ++
paddle/pten/backends/gpu/gpu_helper.h | 26 +
paddle/pten/backends/gpu/gpu_info.cc | 60 ++
paddle/pten/backends/gpu/gpu_info.h | 132 +++
paddle/pten/backends/gpu/gpu_launch_config.h | 181 ++++
paddle/pten/backends/gpu/gpu_types.h | 73 ++
paddle/pten/backends/gpu/rocm/CMakeLists.txt | 1 +
paddle/pten/backends/gpu/rocm/rocm_helper.h | 72 ++
.../backends}/gpu/rocm/rocm_info.cc | 128 ++-
paddle/pten/backends/xpu/xpu_context.cc | 102 +-
paddle/pten/backends/xpu/xpu_context.h | 32 +-
paddle/pten/core/device_context.cc | 14 +-
paddle/pten/core/device_context.h | 6 +-
.../pten/kernels/funcs/cuda_kernel_config.h | 3 +-
paddle/pten/kernels/funcs/elementwise_base.h | 5 +-
paddle/pten/kernels/funcs/transpose.cu | 1 +
paddle/pten/kernels/gpu/concat_and_split.h | 11 +-
paddle/pten/kernels/gpu/copy_kernel.cu | 20 +-
paddle/pten/kernels/gpu/elementwise.h | 8 +-
paddle/pten/kernels/gpu/reduce.h | 21 +-
.../kernels/impl/matmul_grad_kernel_impl.h | 8 +-
paddle/pten/tests/api/test_matmul_api.cc | 3 +-
.../pten/tests/api/test_sparse_utils_api.cc | 1 +
paddle/pten/tests/core/test_device_context.cc | 32 +-
.../pten/tests/kernels/test_cast_dev_api.cc | 8 +-
.../pten/tests/kernels/test_concat_dev_api.cc | 8 +-
.../pten/tests/kernels/test_conj_dev_api.cc | 8 +-
.../pten/tests/kernels/test_copy_dev_api.cc | 8 +-
.../tests/kernels/test_creation_dev_api.cc | 32 +-
paddle/pten/tests/kernels/test_dot_dev_api.cc | 8 +-
.../tests/kernels/test_elementwise_dev_api.cc | 32 +-
.../tests/kernels/test_flatten_dev_api.cc | 8 +-
.../pten/tests/kernels/test_matmul_dev_api.cc | 8 +-
.../pten/tests/kernels/test_mean_dev_api.cc | 8 +-
.../tests/kernels/test_reshape_dev_api.cc | 8 +-
.../pten/tests/kernels/test_scale_dev_api.cc | 18 +-
.../kernels/test_sparse_utils_dev_api.cc | 61 +-
paddle/pten/tests/kernels/test_sum_dev_api.cc | 9 +-
135 files changed, 4438 insertions(+), 770 deletions(-)
create mode 100644 paddle/pten/backends/gpu/CMakeLists.txt
create mode 100644 paddle/pten/backends/gpu/cuda/CMakeLists.txt
create mode 100644 paddle/pten/backends/gpu/cuda/cuda_helper.h
rename paddle/{fluid/platform/device => pten/backends}/gpu/cuda/cuda_info.cc (71%)
create mode
100644 paddle/pten/backends/gpu/forwards.h create mode 100644 paddle/pten/backends/gpu/gpu_context.cc create mode 100644 paddle/pten/backends/gpu/gpu_decls.h create mode 100644 paddle/pten/backends/gpu/gpu_helper.h create mode 100644 paddle/pten/backends/gpu/gpu_info.cc create mode 100644 paddle/pten/backends/gpu/gpu_info.h create mode 100644 paddle/pten/backends/gpu/gpu_launch_config.h create mode 100644 paddle/pten/backends/gpu/gpu_types.h create mode 100644 paddle/pten/backends/gpu/rocm/CMakeLists.txt create mode 100644 paddle/pten/backends/gpu/rocm/rocm_helper.h rename paddle/{fluid/platform/device => pten/backends}/gpu/rocm/rocm_info.cc (72%) diff --git a/paddle/fluid/distributed/common/utils.h b/paddle/fluid/distributed/common/utils.h index fb2189b8f5a1b..85b89d75b98b6 100644 --- a/paddle/fluid/distributed/common/utils.h +++ b/paddle/fluid/distributed/common/utils.h @@ -33,7 +33,7 @@ namespace distributed { template inline paddle::operators::math::BlasT GetBlas() { - auto cpu_ctx = paddle::platform::CPUDeviceContext(); + paddle::platform::CPUDeviceContext cpu_ctx; return paddle::operators::math::GetBlas(cpu_ctx); } diff --git a/paddle/fluid/distributed/ps/service/communicator/communicator.cc b/paddle/fluid/distributed/ps/service/communicator/communicator.cc index 3f1667e5344d6..99973ee8bdd74 100644 --- a/paddle/fluid/distributed/ps/service/communicator/communicator.cc +++ b/paddle/fluid/distributed/ps/service/communicator/communicator.cc @@ -1155,7 +1155,7 @@ void GeoCommunicator::SendDense(const CommContext &send_ctx) { auto &t_latest = var_latest->Get(); auto t_timestamp = var_timestamp->GetMutable(); - auto cpu_ctx = paddle::platform::CPUDeviceContext(); + paddle::platform::CPUDeviceContext cpu_ctx; auto *var_delta = delta_scope_->Var(varname); auto *t_delta = var_delta->GetMutable(); t_delta->mutable_data(t_latest.dims(), cpu_ctx.GetPlace()); @@ -1185,7 +1185,7 @@ void GeoCommunicator::RecvDense(const CommContext &send_ctx) { RpcRecvDense(varnames, table_id, pserver_scope_.get()); // 2.1 pserver - old => delta; 2.2 latest + old => latest 2.3 old => pserver - auto cpu_ctx = paddle::platform::CPUDeviceContext(); + paddle::platform::CPUDeviceContext cpu_ctx; for (auto &varname : varnames) { auto *var_latest = recv_scope_->FindVar(varname); auto t_latest = var_latest->GetMutable(); @@ -1292,7 +1292,7 @@ void GeoCommunicator::SendSparse(const std::string &varname, auto *t_old = var_old->GetMutable(); auto dims1 = t_latest.dims()[1]; - auto cpu_ctx = paddle::platform::CPUDeviceContext(); + paddle::platform::CPUDeviceContext cpu_ctx; auto *var_delta = delta_scope_->Var(varname); auto *t_delta = var_delta->GetMutable(); @@ -1370,7 +1370,7 @@ void GeoCommunicator::RecvSparse(const std::string &varname, int table_id, std::vector v_delta; v_delta.resize(numel); - auto cpu_ctx = paddle::platform::CPUDeviceContext(); + paddle::platform::CPUDeviceContext cpu_ctx; auto blas = paddle::operators::math::GetBlas( cpu_ctx); diff --git a/paddle/fluid/distributed/ps/service/communicator/communicator.h b/paddle/fluid/distributed/ps/service/communicator/communicator.h index c63f341607439..da4e2f1a12898 100644 --- a/paddle/fluid/distributed/ps/service/communicator/communicator.h +++ b/paddle/fluid/distributed/ps/service/communicator/communicator.h @@ -179,7 +179,7 @@ inline void MergeVars(const std::string &var_name, } // set output tensor to 0. 
- auto cpu_ctx = paddle::platform::CPUDeviceContext(); + paddle::platform::CPUDeviceContext cpu_ctx; paddle::operators::math::SetConstant constant_functor; constant_functor(cpu_ctx, out_t, static_cast(0)); @@ -204,7 +204,7 @@ inline void MergeVars(const std::string &var_name, for (auto &var : vars) { inputs.push_back(&var->Get()); } - auto dev_ctx = paddle::platform::CPUDeviceContext(); + paddle::platform::CPUDeviceContext dev_ctx; if (merge_add) { paddle::operators::math::scatter::MergeAdd< paddle::platform::CPUDeviceContext, T> diff --git a/paddle/fluid/framework/data_type_transform_test.cu b/paddle/fluid/framework/data_type_transform_test.cu index 0874509a8797c..6b84fdf0ec9a6 100644 --- a/paddle/fluid/framework/data_type_transform_test.cu +++ b/paddle/fluid/framework/data_type_transform_test.cu @@ -21,7 +21,10 @@ TEST(DataTypeTransform, GPUTransform) { auto cpu_place = paddle::platform::CPUPlace(); auto gpu_place = paddle::platform::CUDAPlace(0); paddle::platform::CUDADeviceContext context(gpu_place); - + context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(gpu_place, context.stream()) + .get()); + context.PartialInitWithAllocator(); auto kernel_fp16 = paddle::framework::OpKernelType( paddle::framework::proto::VarType::FP16, gpu_place, paddle::framework::DataLayout::kAnyLayout, diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 8767b55062cda..7eebfb904cf66 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -1361,7 +1361,7 @@ void ParallelExecutor::PrepareNCCLCommunicator(Scope *global_scope) { auto *dev_ctx = static_cast( pool.Get(member_->places_[dev_id])); auto &bkcl_ctx = bkcl_ctxs->at(member_->places_[dev_id]); - dev_ctx->set_bkcl_context(bkcl_ctx.comm()); + dev_ctx->SetBkclContext(bkcl_ctx.comm()); } #else PADDLE_THROW( diff --git a/paddle/fluid/framework/pten_utils.h b/paddle/fluid/framework/pten_utils.h index 2d335fc9c9894..ae0388079d280 100644 --- a/paddle/fluid/framework/pten_utils.h +++ b/paddle/fluid/framework/pten_utils.h @@ -77,6 +77,13 @@ struct ConvertToPtenContext { using TYPE = pten::CPUContext; }; +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +template <> +struct ConvertToPtenContext { + using TYPE = pten::GPUContext; +}; +#endif + #ifdef PADDLE_WITH_XPU template <> struct ConvertToPtenContext { diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index dff4879096056..b926a3cc7659b 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -1085,7 +1085,7 @@ void TensorFromStream(std::istream& is, Tensor* tensor, is.seekg(seekg, is.cur); void* buf; - auto ctx = platform::CPUDeviceContext(); + platform::CPUDeviceContext ctx; size_t size = tensor->numel() * framework::SizeOfType(desc.data_type()); if (platform::is_gpu_place(dev_ctx.GetPlace()) || platform::is_xpu_place(dev_ctx.GetPlace()) || @@ -1155,7 +1155,7 @@ void TensorFromStream(std::istream& is, Tensor* tensor, std::copy(desc.dims().begin(), desc.dims().end(), std::back_inserter(dims)); tensor->Resize(framework::make_ddim(dims)); void* buf; - auto ctx = platform::CPUDeviceContext(); + platform::CPUDeviceContext ctx; size_t size = tensor->numel() * framework::SizeOfType(desc.data_type()); if (platform::is_gpu_place(dev_ctx.GetPlace()) || platform::is_xpu_place(dev_ctx.GetPlace()) || @@ -1432,4 +1432,4 @@ std::ostream& operator<<(std::ostream& os, const pten::DenseTensor& 
t) { VLOG(1) << "PrintVar: unrecognized data type:" << t.type(); return os; } -} +} // namespace pten diff --git a/paddle/fluid/framework/tensor_util_test.cc b/paddle/fluid/framework/tensor_util_test.cc index 8587ee8d1e919..fbbf29254d0f9 100644 --- a/paddle/fluid/framework/tensor_util_test.cc +++ b/paddle/fluid/framework/tensor_util_test.cc @@ -73,6 +73,10 @@ TEST(TensorCopy, Tensor) { // CPU Tensor to GPU Tensor auto gpu_place = new platform::CUDAPlace(0); platform::CUDADeviceContext gpu_ctx(*gpu_place); + gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(*gpu_place, gpu_ctx.stream()) + .get()); + gpu_ctx.PartialInitWithAllocator(); TensorCopy(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor); // GPU Tensor to CPU Tensor @@ -166,6 +170,10 @@ TEST(TensorFromVector, Tensor) { gpu_tensor.Resize(paddle::framework::make_ddim({3, 3})); auto gpu_place = new paddle::platform::CUDAPlace(); paddle::platform::CUDADeviceContext gpu_ctx(*gpu_place); + gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(*gpu_place, gpu_ctx.stream()) + .get()); + gpu_ctx.PartialInitWithAllocator(); paddle::framework::TensorFromVector(src_vec, gpu_ctx, &gpu_tensor); // Copy from GPU to CPU tensor for comparison paddle::framework::TensorCopy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor); @@ -230,6 +238,10 @@ TEST(TensorToVector, Tensor) { paddle::framework::Tensor gpu_tensor; paddle::platform::CUDAPlace place; paddle::platform::CUDADeviceContext gpu_ctx(place); + gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(place, gpu_ctx.stream()) + .get()); + gpu_ctx.PartialInitWithAllocator(); paddle::framework::TensorFromVector(src_vec, gpu_ctx, &gpu_tensor); std::vector dst; @@ -267,6 +279,10 @@ TEST(TensorToVector, Tensor_bool) { paddle::framework::Tensor gpu_tensor; paddle::platform::CUDAPlace place; paddle::platform::CUDADeviceContext gpu_ctx(place); + gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(place, gpu_ctx.stream()) + .get()); + gpu_ctx.PartialInitWithAllocator(); paddle::framework::TensorFromVector(src_vec, gpu_ctx, &gpu_tensor); std::vector dst; @@ -493,6 +509,10 @@ TEST(Tensor, FromAndToStream) { auto gpu_place = new platform::CUDAPlace(); platform::CUDADeviceContext gpu_ctx(*gpu_place); + gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(*gpu_place, gpu_ctx.stream()) + .get()); + gpu_ctx.PartialInitWithAllocator(); TensorCopy(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor); diff --git a/paddle/fluid/imperative/gloo_context.cc b/paddle/fluid/imperative/gloo_context.cc index 44315e267ee78..eeac500cac413 100644 --- a/paddle/fluid/imperative/gloo_context.cc +++ b/paddle/fluid/imperative/gloo_context.cc @@ -46,6 +46,17 @@ void GLOOParallelContext::Init() { gloo_wrapper->Init(); device_ = std::unique_ptr( new platform::CPUDeviceContext(platform::CPUPlace())); + device_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(platform::CPUPlace()) + .get()); + device_->SetHostAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + device_->SetZeroAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetZeroAllocator(platform::CPUPlace()) + .get()); } void GLOOParallelContext::InitWithRingID(int ring_id) { diff --git a/paddle/fluid/inference/lite/test_engine_lite.cc 
b/paddle/fluid/inference/lite/test_engine_lite.cc index b2750fd070d3e..85f7d3ee363a7 100644 --- a/paddle/fluid/inference/lite/test_engine_lite.cc +++ b/paddle/fluid/inference/lite/test_engine_lite.cc @@ -77,6 +77,10 @@ void make_fake_model(std::string* model, std::string* param) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) platform::CUDAPlace place; platform::CUDADeviceContext ctx(place); + ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(place, ctx.stream()) + .get()); + ctx.PartialInitWithAllocator(); #else platform::CPUPlace place; platform::CPUDeviceContext ctx(place); diff --git a/paddle/fluid/inference/tensorrt/test_engine.cc b/paddle/fluid/inference/tensorrt/test_engine.cc index d5cc69ea661d9..fa03d5320f2e7 100644 --- a/paddle/fluid/inference/tensorrt/test_engine.cc +++ b/paddle/fluid/inference/tensorrt/test_engine.cc @@ -27,6 +27,18 @@ class TensorRTEngineTest : public ::testing::Test { protected: void SetUp() override { ctx_ = new platform::CUDADeviceContext(platform::CUDAPlace(0)); + ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(platform::CUDAPlace(0), ctx_->stream()) + .get()); + ctx_->SetHostAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + ctx_->SetZeroAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetZeroAllocator(platform::CUDAPlace(0)) + .get()); + ctx_->PartialInitWithAllocator(); engine_ = new TensorRTEngine(10, 1 << 10); engine_->InitNetwork(); diff --git a/paddle/fluid/memory/allocation/best_fit_allocator_test.cu b/paddle/fluid/memory/allocation/best_fit_allocator_test.cu index 59c14103ca67d..cad31f5bba91b 100644 --- a/paddle/fluid/memory/allocation/best_fit_allocator_test.cu +++ b/paddle/fluid/memory/allocation/best_fit_allocator_test.cu @@ -18,6 +18,7 @@ #include #include "gtest/gtest.h" +#include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/fluid/memory/allocation/best_fit_allocator.h" #include "paddle/fluid/memory/allocation/cuda_allocator.h" #include "paddle/fluid/memory/allocation/locked_allocator.h" @@ -44,6 +45,10 @@ TEST(BestFitAllocator, concurrent_cuda) { platform::CUDAPlace gpu(0); platform::CUDADeviceContext dev_ctx(gpu); + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(gpu, dev_ctx.stream()) + .get()); + dev_ctx.PartialInitWithAllocator(); auto th_main = [&](std::random_device::result_type seed) { std::default_random_engine engine(seed); diff --git a/paddle/fluid/memory/malloc_test.cu b/paddle/fluid/memory/malloc_test.cu index d015ed7ce693f..07577531d6459 100644 --- a/paddle/fluid/memory/malloc_test.cu +++ b/paddle/fluid/memory/malloc_test.cu @@ -25,6 +25,7 @@ #include #include "gtest/gtest.h" +#include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/platform/device_context.h" @@ -105,8 +106,21 @@ TEST(Malloc, CUDADeviceContextMultiStream) { main_stream_alloc_ptr.reset(); for (int i = 0; i < NUM_STREAMS; ++i) { - dev_ctx.push_back(std::unique_ptr( - new platform::CUDADeviceContext(place))); + auto ctx = std::unique_ptr( + new platform::CUDADeviceContext(place)); + ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(place, ctx->stream()) + .get()); + ctx->SetHostAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); 
+ ctx->SetZeroAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetZeroAllocator(place) + .get()); + ctx->PartialInitWithAllocator(); + dev_ctx.emplace_back(std::move(ctx)); MultiStreamCompute(&data[i], &second_data[i], *dev_ctx[i]); } @@ -144,8 +158,21 @@ TEST(Malloc, CUDADeviceContextMultiThreadMultiStream) { main_stream_alloc_ptr.reset(); for (int i = 0; i < NUM_STREAMS; ++i) { - dev_ctx.push_back(std::unique_ptr( - new platform::CUDADeviceContext(place))); + auto ctx = std::unique_ptr( + new platform::CUDADeviceContext(place)); + ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(place, ctx->stream()) + .get()); + ctx->SetHostAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + ctx->SetZeroAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetZeroAllocator(place) + .get()); + ctx->PartialInitWithAllocator(); + dev_ctx.emplace_back(std::move(ctx)); threads.push_back(std::thread(MultiStreamCompute, &data[i], &second_data[i], std::cref(*dev_ctx[i]))); } diff --git a/paddle/fluid/operators/arg_min_max_op_base.cu.h b/paddle/fluid/operators/arg_min_max_op_base.cu.h index 2c34d6f8300a7..89ba2dfb9250b 100644 --- a/paddle/fluid/operators/arg_min_max_op_base.cu.h +++ b/paddle/fluid/operators/arg_min_max_op_base.cu.h @@ -110,7 +110,7 @@ void ComputeFullArg(const platform::CUDADeviceContext& ctx, const Tensor& input, return block_size; }; - int64_t max_grid_dimx = ctx.GetCUDAMaxGridDimSize().x; + int64_t max_grid_dimx = ctx.GetCUDAMaxGridDimSize()[0]; int64_t height = pre * post; int64_t width = n; int64_t grid_size = height < max_grid_dimx ? height : max_grid_dimx; diff --git a/paddle/fluid/operators/argsort_op.cu b/paddle/fluid/operators/argsort_op.cu index 6236a07de4bc6..b6a1f1f6d2358 100644 --- a/paddle/fluid/operators/argsort_op.cu +++ b/paddle/fluid/operators/argsort_op.cu @@ -131,7 +131,7 @@ void ArgFullSort(const platform::CUDADeviceContext& ctx, const Tensor* input, int block_size = ComputeBlockSize(num_cols); - int maxGridDimX = ctx.GetCUDAMaxGridDimSize().x; + int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0]; // actually, int num_rows < max_grid_size int grid_size = num_rows < maxGridDimX ? num_rows : maxGridDimX; // Init a index array @@ -212,7 +212,7 @@ void ArgFullAssign(const platform::CUDADeviceContext& ctx, const Tensor* dO, int block_size = ComputeBlockSize(num_cols); - int maxGridDimX = ctx.GetCUDAMaxGridDimSize().x; + int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0]; // actually, int num_rows < max_grid_size int grid_size = num_rows < maxGridDimX ? 
num_rows : maxGridDimX; FillGrad<<>>( diff --git a/paddle/fluid/operators/broadcast_tensors_op.cu b/paddle/fluid/operators/broadcast_tensors_op.cu index 718e7ce396621..5a835c7678fa2 100644 --- a/paddle/fluid/operators/broadcast_tensors_op.cu +++ b/paddle/fluid/operators/broadcast_tensors_op.cu @@ -90,8 +90,8 @@ class CUDABroadcastTensorsGradOpKernel : public framework::OpKernel { // reduce_sum implementation on CUDA auto stream = context.cuda_device_context().stream(); TensorReduceFunctorImpl>( - *input_tensor, output_tensor, kps::IdentityFunctor(), - reduce_dims_vec, stream); + context.cuda_device_context(), *input_tensor, output_tensor, + kps::IdentityFunctor(), reduce_dims_vec, stream); } } } diff --git a/paddle/fluid/operators/cholesky_solve_op.cu b/paddle/fluid/operators/cholesky_solve_op.cu index f42364c9619ef..a5d5baf19dad0 100644 --- a/paddle/fluid/operators/cholesky_solve_op.cu +++ b/paddle/fluid/operators/cholesky_solve_op.cu @@ -115,7 +115,8 @@ class MatrixReduceSumFunctor { } gpuStream_t stream = ctx.cuda_device_context().stream(); TensorReduceFunctorImpl>( - in, out, kps::IdentityFunctor(), out_reduce_dims, stream); + ctx.cuda_device_context(), in, out, kps::IdentityFunctor(), + out_reduce_dims, stream); } }; diff --git a/paddle/fluid/operators/clip_by_norm_op.cu b/paddle/fluid/operators/clip_by_norm_op.cu index 4d04fdc8ce2d2..f2714d1378510 100644 --- a/paddle/fluid/operators/clip_by_norm_op.cu +++ b/paddle/fluid/operators/clip_by_norm_op.cu @@ -77,7 +77,7 @@ class ClipByNormKernel {1}, dev_ctx); TensorReduceFunctorImpl>( - *input, &tmp, kps::SquareFunctor(), + dev_ctx, *input, &tmp, kps::SquareFunctor(), reduce_dims, dev_ctx.stream()); auto tmp_eigen = EigenVector::Flatten(tmp); auto x_norm = tmp_eigen.sqrt(); diff --git a/paddle/fluid/operators/controlflow/compare_all_op.cu b/paddle/fluid/operators/controlflow/compare_all_op.cu index 54f59c40a205d..6526d774cafe8 100644 --- a/paddle/fluid/operators/controlflow/compare_all_op.cu +++ b/paddle/fluid/operators/controlflow/compare_all_op.cu @@ -65,7 +65,8 @@ class CompareReduceOpKernel auto stream = context.cuda_device_context().stream(); TensorReduceFunctorImpl>( - tmp, z, kps::IdentityFunctor(), reduce_dims, stream); + context.cuda_device_context(), tmp, z, kps::IdentityFunctor(), + reduce_dims, stream); } } }; diff --git a/paddle/fluid/operators/copy_cross_scope_test.cc b/paddle/fluid/operators/copy_cross_scope_test.cc index 5f951ad337e8e..95e30efda0f78 100644 --- a/paddle/fluid/operators/copy_cross_scope_test.cc +++ b/paddle/fluid/operators/copy_cross_scope_test.cc @@ -131,12 +131,20 @@ void Compare2(f::Scope* scope, const p::DeviceContext& ctx, TEST(copy_cross_scope, CUDA_fp32) { f::Scope scope; p::CUDADeviceContext ctx(p::CUDAPlace(0)); + ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(p::CUDAPlace(0), ctx.stream()) + .get()); + ctx.PartialInitWithAllocator(); Compare1(&scope, ctx, "copy_cross_scope"); } TEST(copy_cross_scope_to_main_scope, CUDA_fp32) { f::Scope scope; p::CUDADeviceContext ctx(p::CUDAPlace(0)); + ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(p::CUDAPlace(0), ctx.stream()) + .get()); + ctx.PartialInitWithAllocator(); Compare2(&scope, ctx, "copy_cross_scope"); } #elif PADDLE_WITH_ASCEND_CL diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu index 8923f1fd4b866..56580e6d59525 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu +++ 
b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_mul_op.h" +#include "paddle/pten/backends/gpu/gpu_context.h" namespace ops = paddle::operators; namespace plat = paddle::platform; @@ -51,8 +52,8 @@ class ElementwiseMulKernel auto pt_x = paddle::experimental::MakePtenDenseTensor(*x_lod); auto pt_y = paddle::experimental::MakePtenDenseTensor(*y_lod); auto pt_z = paddle::experimental::MakePtenDenseTensor(*z_lod); - pten::MultiplyRawKernel(cuda_ctx, *pt_x.get(), *pt_y.get(), axis, - pt_z.get()); + pten::MultiplyRawKernel(static_cast(cuda_ctx), + *pt_x.get(), *pt_y.get(), axis, pt_z.get()); } else { PADDLE_THROW(platform::errors::InvalidArgument( "X's type[%s] is not supported by elementwise_op. X's type should be " diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h index fdf04181de76c..74e74870b8e11 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h @@ -1189,7 +1189,8 @@ void ReduceWrapper(const platform::CUDADeviceContext &dev_ctx, int axis, framework::Tensor *src, framework::Tensor *dst) { std::vector reduce_dims = GetReduceDim(dst->dims(), src->dims(), axis); TensorReduceFunctorImpl>( - *src, dst, kps::IdentityFunctor(), reduce_dims, dev_ctx.stream()); + dev_ctx, *src, dst, kps::IdentityFunctor(), reduce_dims, + dev_ctx.stream()); } template diff --git a/paddle/fluid/operators/feed_forward_test.cu b/paddle/fluid/operators/feed_forward_test.cu index 9f44c39a92c5e..dea427393b175 100644 --- a/paddle/fluid/operators/feed_forward_test.cu +++ b/paddle/fluid/operators/feed_forward_test.cu @@ -275,6 +275,18 @@ class TestFeedForward { output_size_ = 3 * num_head_ * dim_head_; input_size_ = dim_embed_; ctx_ = new platform::CUDADeviceContext(place_); + ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(place_, ctx_->stream()) + .get()); + ctx_->SetHostAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + ctx_->SetZeroAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetZeroAllocator(place_) + .get()); + ctx_->PartialInitWithAllocator(); size_src_ = bsz_seq_ * dim_embed_; // src: [bs, seq_len, em_dim] size_weight_ = dim_embed_ * output_size_; // weight: [output_size, em_dim] diff --git a/paddle/fluid/operators/fused/attn_gemm.h b/paddle/fluid/operators/fused/attn_gemm.h index b9b881cf83e0b..1128997fd259a 100644 --- a/paddle/fluid/operators/fused/attn_gemm.h +++ b/paddle/fluid/operators/fused/attn_gemm.h @@ -166,7 +166,8 @@ class AttnMatMul { if (support_case_1 || support_case_2) { gpuStream_t stream = dev_ctx_.stream(); TensorReduceFunctorImpl>( - *d_output, d_bias, kps::IdentityFunctor(), {0, 1}, stream); + dev_ctx_, *d_output, d_bias, kps::IdentityFunctor(), {0, 1}, + stream); } else { PADDLE_THROW(platform::errors::InvalidArgument( "Only support reduce when the input dims are [0,1,2,3,4] and " diff --git a/paddle/fluid/operators/gelu_op.cu b/paddle/fluid/operators/gelu_op.cu index 73694471799af..6b778eee43451 100644 --- a/paddle/fluid/operators/gelu_op.cu +++ b/paddle/fluid/operators/gelu_op.cu @@ -108,7 +108,7 @@ static bool TryLaunchFP16FastGeluFwdVectorizeCUDAKernel( is_aligned(y, kAlignment)) { \ size_t 
thread = std::min(512, dev_ctx.GetMaxThreadsPerBlock()); \ size_t block = (n / __vec_size + thread - 1) / thread; \ - block = std::min(block, dev_ctx.GetCUDAMaxGridDimSize().x); \ + block = std::min(block, dev_ctx.GetCUDAMaxGridDimSize()[0]); \ VLOG(10) << "Use FP16 fast gelu fwd kernel, block = " << block \ << " , thread = " << thread; \ FP16FastGeluFwdCUDAKernel< \ @@ -144,7 +144,7 @@ static bool TryLaunchFP16FastGeluBwdVectorizeCUDAKernel( is_aligned(x_g, kAlignment)) { \ size_t thread = std::min(512, dev_ctx.GetMaxThreadsPerBlock()); \ size_t block = (n / __vec_size + thread - 1) / thread; \ - block = std::min(block, dev_ctx.GetCUDAMaxGridDimSize().x); \ + block = std::min(block, dev_ctx.GetCUDAMaxGridDimSize()[0]); \ VLOG(10) << "Use FP16 fast gelu bwd kernel, block = " << block \ << " , thread = " << thread; \ FP16FastGeluBwdCUDAKernel< \ diff --git a/paddle/fluid/operators/graph_khop_sampler_op.cu b/paddle/fluid/operators/graph_khop_sampler_op.cu index 777ec64f6e008..df977b43512a0 100644 --- a/paddle/fluid/operators/graph_khop_sampler_op.cu +++ b/paddle/fluid/operators/graph_khop_sampler_op.cu @@ -260,7 +260,7 @@ void FillHashTable(const framework::ExecutionContext& ctx, const T* input, int block = 1024; #endif const auto& dev_ctx = ctx.cuda_device_context(); - int max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize().x; + int max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0]; int grid_tmp = (num_input + block - 1) / block; int grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx; // 1. Insert data into keys and values. @@ -334,7 +334,7 @@ void ReindexFunc(const framework::ExecutionContext& ctx, int block = 1024; #endif const auto& dev_ctx = ctx.cuda_device_context(); - int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize().x; + int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0]; int64_t grid_tmp = (outputs->size() + block - 1) / block; int64_t grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx; ReindexSrcOutput< diff --git a/paddle/fluid/operators/graph_send_recv_op.cu b/paddle/fluid/operators/graph_send_recv_op.cu index 6e5e203e2d943..446ad2d97a7fb 100644 --- a/paddle/fluid/operators/graph_send_recv_op.cu +++ b/paddle/fluid/operators/graph_send_recv_op.cu @@ -197,7 +197,7 @@ void GraphSendRecvOpCUDAKernelLaunchHelper( #endif int64_t n = slice_size * index_size; const auto& dev_ctx = ctx.cuda_device_context(); - int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize().x; + int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0]; int64_t grid_tmp = (n + block - 1) / block; int64_t grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx; int64_t input_size = src_dims[0]; @@ -320,7 +320,7 @@ void GraphSendRecvGradOpCUDAKernelLaunchHelper( #endif int64_t n = slice_size * index_size; const auto& dev_ctx = ctx.cuda_device_context(); - int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize().x; + int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0]; int64_t grid_tmp = (n + block - 1) / block; int64_t grid = grid_tmp < max_grid_dimx ? 
grid_tmp : max_grid_dimx; int64_t input_size = src_dims[0]; diff --git a/paddle/fluid/operators/gumbel_softmax_op.cu b/paddle/fluid/operators/gumbel_softmax_op.cu index 4e91e689fa58c..63577ed1e0f1f 100644 --- a/paddle/fluid/operators/gumbel_softmax_op.cu +++ b/paddle/fluid/operators/gumbel_softmax_op.cu @@ -92,7 +92,7 @@ struct OneHotGenerator { const int size_from_axis = SizeFromAxis(axis, X.dims()); const int size_out_axis = SizeOutAxis(axis, X.dims()); constexpr int thread_size = 512; - int64_t max_grid_dimx = context.GetCUDAMaxGridDimSize().x; + int64_t max_grid_dimx = context.GetCUDAMaxGridDimSize()[0]; int64_t height = size_to_axis * size_out_axis; int block_size = height < max_grid_dimx ? height : max_grid_dimx; diff --git a/paddle/fluid/operators/index_sample_op.cu b/paddle/fluid/operators/index_sample_op.cu index 45f63c2b2fbd8..4c9dec1400076 100644 --- a/paddle/fluid/operators/index_sample_op.cu +++ b/paddle/fluid/operators/index_sample_op.cu @@ -27,10 +27,10 @@ namespace operators { namespace { void LimitGridDim(const framework::ExecutionContext& ctx, dim3* grid_dim) { - dim3 max_grid_dim = ctx.template device_context() + auto max_grid_dim = ctx.template device_context() .GetCUDAMaxGridDimSize(); - grid_dim->x = grid_dim->x < max_grid_dim.x ? grid_dim->x : max_grid_dim.x; - grid_dim->y = grid_dim->y < max_grid_dim.y ? grid_dim->y : max_grid_dim.y; + grid_dim->x = grid_dim->x < max_grid_dim[0] ? grid_dim->x : max_grid_dim[0]; + grid_dim->y = grid_dim->y < max_grid_dim[1] ? grid_dim->y : max_grid_dim[1]; } } diff --git a/paddle/fluid/operators/interpolate_v2_op.cu b/paddle/fluid/operators/interpolate_v2_op.cu index a4d5e75e7289b..3901a2515450d 100644 --- a/paddle/fluid/operators/interpolate_v2_op.cu +++ b/paddle/fluid/operators/interpolate_v2_op.cu @@ -45,11 +45,11 @@ inline platform::GpuLaunchConfig GetGpuLaunchConfig3D( int block_y = std::min(GetLastPow2(height), max_threads / block_x); int block_z = std::min(num_img, max_threads / block_x / block_y); - dim3 max_grid_dim = context.GetCUDAMaxGridDimSize(); - int grid_x = std::min(max_grid_dim.x, platform::DivUp(width, block_x)); - int grid_y = std::min(max_grid_dim.y, platform::DivUp(height, block_y)); + auto max_grid_dim = context.GetCUDAMaxGridDimSize(); + int grid_x = std::min(max_grid_dim[0], platform::DivUp(width, block_x)); + int grid_y = std::min(max_grid_dim[1], platform::DivUp(height, block_y)); int grid_z = - std::min(max_grid_dim.z, platform::DivUp(num_img, block_z * 4)); + std::min(max_grid_dim[2], platform::DivUp(num_img, block_z * 4)); const int capability = context.GetComputeCapability(); platform::GpuLaunchConfig config; diff --git a/paddle/fluid/operators/kron_op.h b/paddle/fluid/operators/kron_op.h index 6039d8c624052..ff3baf4d70f10 100644 --- a/paddle/fluid/operators/kron_op.h +++ b/paddle/fluid/operators/kron_op.h @@ -306,11 +306,11 @@ struct KronGradOpFunctor { auto stream = dev_ctx.stream(); // it is a cuda device_context if (dx) { TensorReduceFunctorImpl>( - dout_x, dx, kps::IdentityFunctor(), {1}, stream); + dev_ctx, dout_x, dx, kps::IdentityFunctor(), {1}, stream); } if (dy) { TensorReduceFunctorImpl>( - dout_y, dy, kps::IdentityFunctor(), {1}, stream); + dev_ctx, dout_y, dy, kps::IdentityFunctor(), {1}, stream); } #else auto* place = dev_ctx.eigen_device(); diff --git a/paddle/fluid/operators/kthvalue_op.cu b/paddle/fluid/operators/kthvalue_op.cu index c6c62a763aa06..0941f9d4c3b3b 100644 --- a/paddle/fluid/operators/kthvalue_op.cu +++ b/paddle/fluid/operators/kthvalue_op.cu @@ -54,7 +54,7 @@ bool 
SortKthvalue(const platform::CUDADeviceContext& ctx, input_indices.mutable_data(ctx.GetPlace()); size_t temp_storage_bytes = -1; int block_size = getBlockSize(num_cols); - unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize().x; + unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0]; unsigned int grid_size = num_rows < maxGridDimX ? static_cast(num_rows) : maxGridDimX; diff --git a/paddle/fluid/operators/lite/lite_engine_op_test.cc b/paddle/fluid/operators/lite/lite_engine_op_test.cc index 053ba322d8f4d..01583cea31222 100644 --- a/paddle/fluid/operators/lite/lite_engine_op_test.cc +++ b/paddle/fluid/operators/lite/lite_engine_op_test.cc @@ -72,6 +72,10 @@ TEST(LiteEngineOp, engine_op) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) platform::CUDAPlace place; platform::CUDADeviceContext ctx(place); + ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(place, ctx.stream()) + .get()); + ctx.PartialInitWithAllocator(); #else platform::CPUPlace place; platform::CPUDeviceContext ctx(place); diff --git a/paddle/fluid/operators/margin_cross_entropy_op.cu b/paddle/fluid/operators/margin_cross_entropy_op.cu index e4fb4150f841b..a8a3390c00207 100644 --- a/paddle/fluid/operators/margin_cross_entropy_op.cu +++ b/paddle/fluid/operators/margin_cross_entropy_op.cu @@ -299,7 +299,7 @@ class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel { ctx.AllocateTmpTensor({N, 1}, dev_ctx); T* logits_max_buff = logits_max.mutable_data(place); TensorReduceFunctorImpl>( - softmax_2d, &logits_max, kps::IdentityFunctor(), {1}, + dev_ctx, softmax_2d, &logits_max, kps::IdentityFunctor(), {1}, dev_ctx.stream()); #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) @@ -321,7 +321,7 @@ class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel { ctx.AllocateTmpTensor({N, 1}, dev_ctx); T* sum_exp_logits_buff = sum_exp_logits.mutable_data(place); TensorReduceFunctorImpl>( - softmax_2d, &sum_exp_logits, kps::ExpFunctor(), {1}, + dev_ctx, softmax_2d, &sum_exp_logits, kps::ExpFunctor(), {1}, dev_ctx.stream()); #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) diff --git a/paddle/fluid/operators/math/beam_search_test.cc b/paddle/fluid/operators/math/beam_search_test.cc index ec2e9516fcd4b..e6f6a09a43b20 100644 --- a/paddle/fluid/operators/math/beam_search_test.cc +++ b/paddle/fluid/operators/math/beam_search_test.cc @@ -15,6 +15,8 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/math/beam_search.h" #include +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/place.h" void PrepareCPUTensors(paddle::framework::LoDTensor* ids, paddle::framework::LoDTensor* scores, @@ -129,6 +131,83 @@ void TestBeamSearch() { delete context; } +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +template <> +void TestBeamSearch() { + paddle::framework::LoDTensor ids; + paddle::framework::LoDTensor scores; + paddle::framework::LoDTensor pre_ids; + paddle::framework::LoDTensor pre_scores; + + auto* place = new paddle::platform::CUDAPlace(); + auto* context = new paddle::platform::CUDADeviceContext(*place); + context->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(*place, context->stream()) + .get()); + context->PartialInitWithAllocator(); + if (paddle::platform::is_cpu_place(*place)) { + PrepareCPUTensors(&ids, &scores, &pre_ids, &pre_scores); + } else { + paddle::framework::LoDTensor cpu_ids; + paddle::framework::LoDTensor cpu_scores; + paddle::framework::LoDTensor cpu_pre_ids; + paddle::framework::LoDTensor cpu_pre_scores; + + PrepareCPUTensors(&cpu_ids, &cpu_scores, &cpu_pre_ids, &cpu_pre_scores); + + paddle::framework::TensorCopySync(cpu_ids, *place, &ids); + paddle::framework::TensorCopySync(cpu_scores, *place, &scores); + paddle::framework::TensorCopySync(cpu_pre_ids, *place, &pre_ids); + paddle::framework::TensorCopySync(cpu_pre_scores, *place, &pre_scores); + + ids.set_lod(cpu_ids.lod()); + scores.set_lod(cpu_scores.lod()); + pre_ids.set_lod(cpu_pre_ids.lod()); + pre_scores.set_lod(cpu_pre_scores.lod()); + } + + paddle::framework::LoDTensor selected_ids; + paddle::framework::LoDTensor selected_scores; + paddle::framework::LoDTensor parent_idx; + + size_t level = 0; + size_t beam_size = 2; + int end_id = 0; + paddle::operators::math::BeamSearchFunctor< + paddle::platform::CUDADeviceContext, float> + beamsearch; + beamsearch(*context, &pre_ids, &pre_scores, &ids, &scores, &selected_ids, + &selected_scores, &parent_idx, level, beam_size, end_id, true); + + ASSERT_EQ(selected_ids.lod(), selected_scores.lod()); + + paddle::framework::LoDTensor cpu_selected_ids; + paddle::framework::LoDTensor cpu_selected_scores; + if (paddle::platform::is_cpu_place(*place)) { + cpu_selected_ids = selected_ids; + cpu_selected_scores = selected_scores; + } else { + paddle::framework::TensorCopySync( + selected_ids, paddle::platform::CPUPlace(), &cpu_selected_ids); + paddle::framework::TensorCopySync( + selected_scores, paddle::platform::CPUPlace(), &cpu_selected_scores); + cpu_selected_ids.set_lod(selected_ids.lod()); + cpu_selected_scores.set_lod(selected_scores.lod()); + } + + std::vector expected_ids({4, 5, 3, 8}); + std::vector expected_scores({0.6f, 0.5f, 0.9f, 0.7f}); + for (int i = 0; i < 4; i++) { + ASSERT_EQ(expected_ids[i], cpu_selected_ids.data()[i]); + ASSERT_EQ(expected_scores[i], cpu_selected_scores.data()[i]); + } + + delete place; + delete context; +} +#endif + TEST(BeamSearch, CPU) { TestBeamSearch(); diff --git a/paddle/fluid/operators/math/blas_impl.cu.h b/paddle/fluid/operators/math/blas_impl.cu.h index 92162e639ff86..7ffd2a7ab2d84 100644 --- a/paddle/fluid/operators/math/blas_impl.cu.h +++ b/paddle/fluid/operators/math/blas_impl.cu.h @@ -18,6 +18,7 @@ #include "paddle/fluid/platform/dynload/cublas.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/pten/backends/gpu/gpu_context.h" DECLARE_bool(enable_cublas_tensor_op_math); @@ -92,6 +93,32 @@ 
struct CUBlas { #endif } + // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply. + // https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode + template + static void GEMM_EX(pten::GPUContext *dev_ctx, cublasOperation_t transa, + cublasOperation_t transb, int m, int n, int k, + const float *alpha, const void *A, cudaDataType_t Atype, + int lda, const void *B, cudaDataType_t Btype, int ldb, + const float *beta, void *C, cudaDataType_t Ctype, + int ldc) { +// Because the gcc 4.8 doesn't expand template parameter pack that +// appears in a lambda-expression, I can not use template parameter pack +// here. +#if CUDA_VERSION >= 8000 + VLOG(5) << "use_tensor_op_math: " + << (dev_ctx->tensor_core_available() ? "True" : "False"); + dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasSgemmEx( + handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, + beta, C, Ctype, ldc)); + }); +#else + PADDLE_THROW(platform::errors::Unimplemented( + "cublasSgemmEx is not supported on cuda <= 7.5")); +#endif + } + template static void TRSM(ARGS... args) { PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasStrsm(args...)); @@ -273,6 +300,37 @@ struct CUBlas { "cublasGemmEx is not supported on cuda <= 7.5")); #endif } + + // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply. + // https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode + template + static void GEMM_EX(pten::GPUContext *dev_ctx, cublasOperation_t transa, + cublasOperation_t transb, int m, int n, int k, + const void *alpha, const void *A, cudaDataType_t Atype, + int lda, const void *B, cudaDataType_t Btype, int ldb, + const void *beta, void *C, cudaDataType_t Ctype, int ldc, + cudaDataType_t computeType) { +#if CUDA_VERSION >= 8000 + cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; +#if CUDA_VERSION >= 9000 + bool use_tensor_op_math = dev_ctx->tensor_core_available(); + if (use_tensor_op_math) { + algo = CUBLAS_GEMM_DFALT_TENSOR_OP; + } + VLOG(5) << "use_tensor_op_math: " + << (use_tensor_op_math ? "True" : "False"); +#endif // CUDA_VERSION >= 9000 + + dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasGemmEx( + handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, + beta, C, Ctype, ldc, computeType, algo)); + }); +#else + PADDLE_THROW(platform::errors::Unimplemented( + "cublasGemmEx is not supported on cuda <= 7.5")); +#endif + } }; template <> @@ -388,6 +446,37 @@ struct CUBlas> { #endif } + // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply. + // https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode + template + static void GEMM_EX(pten::GPUContext *dev_ctx, cublasOperation_t transa, + cublasOperation_t transb, int m, int n, int k, + const void *alpha, const void *A, cudaDataType_t Atype, + int lda, const void *B, cudaDataType_t Btype, int ldb, + const void *beta, void *C, cudaDataType_t Ctype, int ldc, + cudaDataType_t computeType) { +#if CUDA_VERSION >= 8000 + cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; +#if CUDA_VERSION >= 9000 + bool use_tensor_op_math = dev_ctx->tensor_core_available(); + if (use_tensor_op_math) { + algo = CUBLAS_GEMM_DFALT_TENSOR_OP; + } + VLOG(5) << "use_tensor_op_math: " + << (use_tensor_op_math ? 
"True" : "False"); +#endif // CUDA_VERSION >= 9000 + + dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasGemmEx( + handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, + beta, C, Ctype, ldc, computeType, algo)); + }); +#else + PADDLE_THROW(platform::errors::Unimplemented( + "cublasGemmEx is not supported on cuda <= 7.5")); +#endif + } + static void TRSM_BATCH(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t transa, cublasDiagType_t diag, int m, int n, @@ -529,6 +618,37 @@ struct CUBlas> { "cublasGemmEx is not supported on cuda <= 7.5")); #endif } + + // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply. + // https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode + template + static void GEMM_EX(pten::GPUContext *dev_ctx, cublasOperation_t transa, + cublasOperation_t transb, int m, int n, int k, + const void *alpha, const void *A, cudaDataType_t Atype, + int lda, const void *B, cudaDataType_t Btype, int ldb, + const void *beta, void *C, cudaDataType_t Ctype, int ldc, + cudaDataType_t computeType) { +#if CUDA_VERSION >= 8000 + cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; +#if CUDA_VERSION >= 9000 + bool use_tensor_op_math = dev_ctx->tensor_core_available(); + if (use_tensor_op_math) { + algo = CUBLAS_GEMM_DFALT_TENSOR_OP; + } + VLOG(5) << "use_tensor_op_math: " + << (use_tensor_op_math ? "True" : "False"); +#endif // CUDA_VERSION >= 9000 + + dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasGemmEx( + handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, + beta, C, Ctype, ldc, computeType, algo)); + }); +#else + PADDLE_THROW(platform::errors::Unimplemented( + "cublasGemmEx is not supported on cuda <= 7.5")); +#endif + } }; template <> @@ -564,6 +684,39 @@ void Blas::GEMM(CBLAS_TRANSPOSE transA, #endif // CUDA_VERSION >= 8000 } +template <> +template +void Blas::GEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, int M, int N, int K, + T alpha, const T *A, const T *B, T beta, + T *C) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (transB == CblasNoTrans) ? 
CUBLAS_OP_N : CUBLAS_OP_T; + +#if CUDA_VERSION >= 8000 + if (FLAGS_enable_cublas_tensor_op_math && std::is_same::value) { + auto &cuda_ctx = const_cast(context_); + CUBlas::GEMM_EX(&cuda_ctx, cuTransB, cuTransA, N, M, K, &alpha, B, + CUDA_R_32F, ldb, A, CUDA_R_32F, lda, &beta, C, + CUDA_R_32F, N); + } else { +#endif // CUDA_VERSION >= 8000 + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM(handle, cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, + lda, &beta, C, N); + }); + +#if CUDA_VERSION >= 8000 + } +#endif // CUDA_VERSION >= 8000 +} + template <> template <> inline void Blas::GEMM( @@ -611,6 +764,55 @@ inline void Blas::GEMM( #endif // CUDA_VERSION >= 8000 } +template <> +template <> +inline void Blas::GEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, int M, int N, + int K, platform::float16 alpha, + const platform::float16 *A, + const platform::float16 *B, + platform::float16 beta, + platform::float16 *C) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + + // TODO(kexinzhao): add processing code for compute capability < 53 case + PADDLE_ENFORCE_GE( + context_.GetComputeCapability(), 53, + platform::errors::InvalidArgument( + "cublas fp16 gemm requires GPU compute capability >= 53," + "but received %d", + context_.GetComputeCapability())); + + float h_alpha = static_cast(alpha); + float h_beta = static_cast(beta); + +#if CUDA_VERSION >= 8000 + // cublasHgemm does true FP16 computation which is slow for non-Volta + // GPUs. So use cublasGemmEx instead which does pesudo FP16 computation: + // input/output in fp16, computation in fp32, which can also be accelerated + // using tensor cores in volta GPUs. + auto &cuda_ctx = const_cast(context_); + CUBlas::GEMM_EX( + &cuda_ctx, cuTransB, cuTransA, N, M, K, &h_alpha, B, CUDA_R_16F, ldb, A, + CUDA_R_16F, lda, &h_beta, C, CUDA_R_16F, N, CUDA_R_32F); +#else + // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm + + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM(handle, cuTransB, cuTransA, N, M, K, + &h_alpha, h_B, ldb, h_A, lda, &h_beta, h_C, + N); + }); +#endif // CUDA_VERSION >= 8000 +} + template <> template <> inline void Blas::GEMM( @@ -659,6 +861,56 @@ inline void Blas::GEMM( #endif // CUDA_VERSION >= 8000 } +template <> +template <> +inline void Blas::GEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, int M, int N, + int K, platform::complex alpha, + const platform::complex *A, + const platform::complex *B, + platform::complex beta, + platform::complex *C) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (transB == CblasNoTrans) ? 
CUBLAS_OP_N : CUBLAS_OP_T; + + // TODO(kexinzhao): add processing code for compute capability < 53 case + PADDLE_ENFORCE_GE( + context_.GetComputeCapability(), 53, + platform::errors::InvalidArgument( + "cublas complex64 gemm requires GPU compute capability >= 53," + "but received %d", + context_.GetComputeCapability())); + + thrust::complex c_alpha = + thrust::complex(alpha.real, alpha.imag); + thrust::complex c_beta = thrust::complex(beta.real, beta.imag); + +#if CUDA_VERSION >= 8000 + // cublasHgemm does true FP16 computation which is slow for non-Volta + // GPUs. So use cublasGemmEx instead which does pesudo FP16 computation: + // input/output in fp16, computation in fp32, which can also be accelerated + // using tensor cores in volta GPUs. + auto &cuda_ctx = const_cast(context_); + CUBlas>::GEMM_EX( + &cuda_ctx, cuTransB, cuTransA, N, M, K, &c_alpha, B, CUDA_C_32F, ldb, A, + CUDA_C_32F, lda, &c_beta, C, CUDA_C_32F, N, CUDA_C_32F); +#else + // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm + + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas>::GEMM(handle, cuTransB, cuTransA, N, M, K, + &c_alpha, h_B, ldb, h_A, lda, + &c_beta, h_C, N); + }); +#endif // CUDA_VERSION >= 8000 +} + template <> template <> inline void Blas::GEMM( @@ -708,6 +960,57 @@ inline void Blas::GEMM( #endif // CUDA_VERSION >= 8000 } +template <> +template <> +inline void Blas::GEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, int M, int N, + int K, platform::complex alpha, + const platform::complex *A, + const platform::complex *B, + platform::complex beta, + platform::complex *C) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + + // TODO(kexinzhao): add processing code for compute capability < 53 case + PADDLE_ENFORCE_GE( + context_.GetComputeCapability(), 53, + platform::errors::InvalidArgument( + "cublas complex128 gemm requires GPU compute capability >= 53," + "but received %d", + context_.GetComputeCapability())); + + thrust::complex c_alpha = + thrust::complex(alpha.real, alpha.imag); + thrust::complex c_beta = + thrust::complex(beta.real, beta.imag); + +#if CUDA_VERSION >= 8000 + // cublasHgemm does true FP16 computation which is slow for non-Volta + // GPUs. So use cublasGemmEx instead which does pesudo FP16 computation: + // input/output in fp16, computation in fp32, which can also be accelerated + // using tensor cores in volta GPUs. 
+ auto &cuda_ctx = const_cast(context_); + CUBlas>::GEMM_EX( + &cuda_ctx, cuTransB, cuTransA, N, M, K, &c_alpha, B, CUDA_C_64F, ldb, A, + CUDA_C_64F, lda, &c_beta, C, CUDA_C_64F, N, CUDA_C_64F); +#else + // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm + + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas>::GEMM(handle, cuTransB, cuTransA, N, M, K, + &c_alpha, h_B, ldb, h_A, lda, + &c_beta, h_C, N); + }); +#endif // CUDA_VERSION >= 8000 +} + template <> template void Blas::GEMM(bool transA, bool transB, int M, @@ -738,6 +1041,35 @@ void Blas::GEMM(bool transA, bool transB, int M, #endif // CUDA_VERSION >= 8000 } +template <> +template +void Blas::GEMM(bool transA, bool transB, int M, int N, int K, + T alpha, const T *A, int lda, const T *B, + int ldb, T beta, T *C, int ldc) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + cublasOperation_t cuTransA = transA ? CUBLAS_OP_T : CUBLAS_OP_N; + cublasOperation_t cuTransB = transB ? CUBLAS_OP_T : CUBLAS_OP_N; + +#if CUDA_VERSION >= 8000 + if (FLAGS_enable_cublas_tensor_op_math && std::is_same::value) { + auto &cuda_ctx = const_cast(context_); + CUBlas::GEMM_EX(&cuda_ctx, cuTransB, cuTransA, N, M, K, &alpha, B, + CUDA_R_32F, ldb, A, CUDA_R_32F, lda, &beta, C, + CUDA_R_32F, ldc); + } else { +#endif // CUDA_VERSION >= 8000 + + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM(handle, cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, + lda, &beta, C, ldc); + }); + +#if CUDA_VERSION >= 8000 + } +#endif // CUDA_VERSION >= 8000 +} + template <> template <> inline void Blas::GEMM( @@ -755,6 +1087,25 @@ inline void Blas::GEMM( }); } +template <> +template <> +inline void Blas::GEMM(bool transA, bool transB, int M, int N, + int K, platform::float16 alpha, + const platform::float16 *A, int lda, + const platform::float16 *B, int ldb, + platform::float16 beta, + platform::float16 *C, int ldc) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + cublasOperation_t cuTransA = transA ? CUBLAS_OP_T : CUBLAS_OP_N; + cublasOperation_t cuTransB = transB ? 
CUBLAS_OP_T : CUBLAS_OP_N; + + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM(handle, cuTransB, cuTransA, N, M, K, &alpha, + B, ldb, A, lda, &beta, C, ldc); + }); +} + template <> template void Blas::AXPY(int n, T alpha, const T *x, @@ -764,6 +1115,14 @@ void Blas::AXPY(int n, T alpha, const T *x, }); } +template <> +template +void Blas::AXPY(int n, T alpha, const T *x, T *y) const { + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::AXPY(handle, n, &alpha, x, 1, y, 1); + }); +} + template <> template void Blas::SCAL(int n, const T alpha, T *x) const { @@ -771,6 +1130,13 @@ void Blas::SCAL(int n, const T alpha, T *x) const { [&](cublasHandle_t handle) { CUBlas::SCAL(handle, n, &alpha, x, 1); }); } +template <> +template +void Blas::SCAL(int n, const T alpha, T *x) const { + context_.CublasCall( + [&](cublasHandle_t handle) { CUBlas::SCAL(handle, n, &alpha, x, 1); }); +} + template <> template void Blas::VCOPY(int n, const T *x, T *y) const { @@ -778,6 +1144,13 @@ void Blas::VCOPY(int n, const T *x, T *y) const { [&](cublasHandle_t handle) { CUBlas::VCOPY(handle, n, x, 1, y, 1); }); } +template <> +template +void Blas::VCOPY(int n, const T *x, T *y) const { + context_.CublasCall( + [&](cublasHandle_t handle) { CUBlas::VCOPY(handle, n, x, 1, y, 1); }); +} + template <> template void Blas::GEMV(bool trans_a, int M, int N, @@ -790,6 +1163,17 @@ void Blas::GEMV(bool trans_a, int M, int N, }); } +template <> +template +void Blas::GEMV(bool trans_a, int M, int N, T alpha, + const T *A, const T *B, T beta, T *C) const { + cublasOperation_t cuTransA = !trans_a ? CUBLAS_OP_T : CUBLAS_OP_N; + + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMV(handle, cuTransA, N, M, &alpha, A, N, B, 1, &beta, C, 1); + }); +} + template <> template <> inline void Blas::GEMV( @@ -806,6 +1190,24 @@ inline void Blas::GEMV( } } +template <> +template <> +inline void Blas::GEMV(bool trans_a, int M, int N, + platform::float16 alpha, + const platform::float16 *A, + const platform::float16 *B, + platform::float16 beta, + platform::float16 *C) const { + // Because cublas doesn't support half gemv, we use cublasHgemm to achieve it. + if (trans_a) { + this->template GEMM(CblasNoTrans, CblasNoTrans, 1, N, M, + alpha, B, A, beta, C); + } else { + this->template GEMM(CblasNoTrans, CblasNoTrans, M, 1, N, + alpha, A, B, beta, C); + } +} + template <> template void Blas::BatchedGEMM( @@ -854,6 +1256,56 @@ void Blas::BatchedGEMM( #endif // CUDA_VERSION >= 9010 } +template <> +template +void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, int M, int N, + int K, T alpha, const T *A, const T *B, + T beta, T *C, int batchCount, + int64_t strideA, + int64_t strideB) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + int ldc = N; + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + const int64_t strideC = M * N; + +#if CUDA_VERSION >= 9010 + if ((FLAGS_enable_cublas_tensor_op_math && (std::is_same::value)) || + std::is_same::value) { + cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; + bool use_tensor_op_math = context_.tensor_core_available(); + if (use_tensor_op_math) { + algo = CUBLAS_GEMM_DFALT_TENSOR_OP; + } + VLOG(5) << "use_tensor_op_math: " + << (use_tensor_op_math ? 
"True" : "False"); + + auto fp = std::is_same::value ? CUDA_R_32F : CUDA_R_16F; + context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasGemmStridedBatchedEx( + handle, cuTransB, cuTransA, N, M, K, &alpha, B, fp, ldb, strideB, A, + fp, lda, strideA, &beta, C, fp, ldc, strideC, batchCount, fp, algo)); + }); + } else { +#endif // CUDA_VERSION >= 9010 + + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM_STRIDED_BATCH(handle, cuTransB, cuTransA, N, M, K, &alpha, + B, ldb, strideB, A, lda, strideA, &beta, C, + ldc, strideC, batchCount); + }); + +#if CUDA_VERSION >= 9010 + } +#endif // CUDA_VERSION >= 9010 +} + template <> template void Blas::BatchedGEMM( @@ -865,6 +1317,19 @@ void Blas::BatchedGEMM( } } +template <> +template +void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, int M, int N, + int K, T alpha, const T **A, + const T **B, T beta, T **C, + int batchCount) const { + for (int k = 0; k < batchCount; ++k) { + this->template GEMM(transA, transB, M, N, K, alpha, A[k], B[k], beta, + C[k]); + } +} + template <> template <> inline void Blas::BatchedGEMM( @@ -878,6 +1343,19 @@ inline void Blas::BatchedGEMM( } } +template <> +template <> +inline void Blas::BatchedGEMM( + CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K, + platform::float16 alpha, const platform::float16 **A, + const platform::float16 **B, platform::float16 beta, platform::float16 **C, + int batchCount) const { + for (int k = 0; k < batchCount; ++k) { + this->template GEMM(transA, transB, M, N, K, alpha, A[k], + B[k], beta, C[k]); + } +} + template <> template void Blas::TRSM(CBLAS_SIDE side, CBLAS_UPLO uplo, @@ -903,6 +1381,30 @@ void Blas::TRSM(CBLAS_SIDE side, CBLAS_UPLO uplo, }); } +template <> +template +void Blas::TRSM(CBLAS_SIDE side, CBLAS_UPLO uplo, + CBLAS_TRANSPOSE transA, CBLAS_DIAG diag, + int M, int N, T alpha, const T *A, int lda, + T *B, int ldb) const { + // solve row major `op ( A ) X = α B` by taking it as `X' op ( A' ) = α B'` + // where ' stands for transpose + cublasSideMode_t cuSide = + (side == CblasLeft) ? CUBLAS_SIDE_RIGHT : CUBLAS_SIDE_LEFT; + cublasFillMode_t cuUplo = + (uplo == CblasLower) ? CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER; + // use CUBLAS_OP_C (conjugate transpose) for complex + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasDiagType_t cuDiag = + (diag == CblasUnit) ? CUBLAS_DIAG_UNIT : CUBLAS_DIAG_NON_UNIT; + + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::TRSM(handle, cuSide, cuUplo, cuTransA, cuDiag, N, M, &alpha, A, + lda, B, ldb); + }); +} + template <> template void Blas::BatchedGETRF(int n, T **a, int *ipiv, @@ -913,6 +1415,15 @@ void Blas::BatchedGETRF(int n, T **a, int *ipiv, }); } +template <> +template +void Blas::BatchedGETRF(int n, T **a, int *ipiv, int *info, + int batch_size) const { + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GETRF_BATCH(handle, n, a, n, ipiv, info, batch_size); + }); +} + template <> template void Blas::BatchedGETRI(int n, const T **a, @@ -931,6 +1442,23 @@ void Blas::BatchedGETRI(int n, const T **a, }); } +template <> +template +void Blas::BatchedGETRI(int n, const T **a, const int *ipiv, + T **a_inv, int *info, + int batch_size) const { + PADDLE_ENFORCE_NE( + a_inv, a, + platform::errors::InvalidArgument( + "cuBLAS fuction 'cublasgetrfBatched' cannot be executed " + "in-place. 
The memory space of output matrix (address: %p) cannot " + "overlap memory space of input matrix (address: %p).", + a_inv, a)); + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GETRI_BATCH(handle, n, a, n, ipiv, a_inv, n, info, batch_size); + }); +} + template <> template void Blas::BatchedMatInv(int n, const T **a, @@ -941,6 +1469,15 @@ void Blas::BatchedMatInv(int n, const T **a, }); } +template <> +template +void Blas::BatchedMatInv(int n, const T **a, T **a_inv, + int *info, int batch_size) const { + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::MATINV_BATCH(handle, n, a, n, a_inv, n, info, batch_size); + }); +} + template <> template void Blas::BatchedGETRS( @@ -955,6 +1492,21 @@ void Blas::BatchedGETRS( }); } +template <> +template +void Blas::BatchedGETRS(CBLAS_TRANSPOSE trans, int n, + int nrhs, const T **a, int lda, + int *ipiv, T **b, int ldb, int *info, + int batch_size) const { + // use CUBLAS_OP_C (conjugate transpose) for complex + cublasOperation_t cuTrans = + (trans == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GETRS_BATCH(handle, cuTrans, n, nrhs, a, lda, ipiv, b, ldb, info, + batch_size); + }); +} + template <> template void Blas::BatchedTRSM( @@ -979,6 +1531,31 @@ void Blas::BatchedTRSM( }); } +template <> +template +void Blas::BatchedTRSM(CBLAS_SIDE side, CBLAS_UPLO uplo, + CBLAS_TRANSPOSE transA, + CBLAS_DIAG diag, int M, int N, T alpha, + const T **A, int lda, T **B, int ldb, + int batch_size) const { + // solve row major `op ( A ) X = α B` by taking it as `X' op ( A' ) = α B'` + // where ' stands for transpose + cublasSideMode_t cuSide = + (side == CblasLeft) ? CUBLAS_SIDE_RIGHT : CUBLAS_SIDE_LEFT; + cublasFillMode_t cuUplo = + (uplo == CblasLower) ? CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER; + // use CUBLAS_OP_C (conjugate transpose) for complex + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasDiagType_t cuDiag = + (diag == CblasUnit) ? 
CUBLAS_DIAG_UNIT : CUBLAS_DIAG_NON_UNIT; + + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::TRSM_BATCH(handle, cuSide, cuUplo, cuTransA, cuDiag, N, M, + &alpha, A, lda, B, ldb, batch_size); + }); +} + } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/blas_impl.hip.h b/paddle/fluid/operators/math/blas_impl.hip.h index 32479189eea58..bf7d66f485327 100644 --- a/paddle/fluid/operators/math/blas_impl.hip.h +++ b/paddle/fluid/operators/math/blas_impl.hip.h @@ -17,6 +17,7 @@ #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/dynload/rocblas.h" +#include "paddle/pten/backends/gpu/gpu_context.h" DECLARE_bool(enable_cublas_tensor_op_math); @@ -221,6 +222,20 @@ struct CUBlas { beta, C, Ctype, ldc, C, Ctype, ldc, computeType, algo, 0, 0)); }); } + template + static void GEMM_EX(pten::GPUContext *dev_ctx, rocblas_operation transa, + rocblas_operation transb, int m, int n, int k, + const void *alpha, const void *A, rocblas_datatype Atype, + int lda, const void *B, rocblas_datatype Btype, int ldb, + const void *beta, void *C, rocblas_datatype Ctype, + int ldc, rocblas_datatype computeType) { + rocblas_gemm_algo algo = rocblas_gemm_algo_standard; + dev_ctx->TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_gemm_ex( + handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, + beta, C, Ctype, ldc, C, Ctype, ldc, computeType, algo, 0, 0)); + }); + } }; template <> @@ -305,6 +320,20 @@ struct CUBlas> { beta, C, Ctype, ldc, C, Ctype, ldc, computeType, algo, 0, 0)); }); } + template + static void GEMM_EX(pten::GPUContext *dev_ctx, rocblas_operation transa, + rocblas_operation transb, int m, int n, int k, + const void *alpha, const void *A, rocblas_datatype Atype, + int lda, const void *B, rocblas_datatype Btype, int ldb, + const void *beta, void *C, rocblas_datatype Ctype, + int ldc, rocblas_datatype computeType) { + rocblas_gemm_algo algo = rocblas_gemm_algo_standard; + dev_ctx->TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_gemm_ex( + handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, + beta, C, Ctype, ldc, C, Ctype, ldc, computeType, algo, 0, 0)); + }); + } }; template <> @@ -389,6 +418,20 @@ struct CUBlas> { beta, C, Ctype, ldc, C, Ctype, ldc, computeType, algo, 0, 0)); }); } + template + static void GEMM_EX(pten::GPUContext *dev_ctx, rocblas_operation transa, + rocblas_operation transb, int m, int n, int k, + const void *alpha, const void *A, rocblas_datatype Atype, + int lda, const void *B, rocblas_datatype Btype, int ldb, + const void *beta, void *C, rocblas_datatype Ctype, + int ldc, rocblas_datatype computeType) { + rocblas_gemm_algo algo = rocblas_gemm_algo_standard; + dev_ctx->TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_gemm_ex( + handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, + beta, C, Ctype, ldc, C, Ctype, ldc, computeType, algo, 0, 0)); + }); + } }; template <> @@ -412,6 +455,27 @@ void Blas::GEMM(CBLAS_TRANSPOSE transA, &beta, C, N); }); } +template <> +template +void Blas::GEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, int M, int N, int K, + T alpha, const T *A, const T *B, T beta, + T *C) const { + // Note that cublas follows fortran order, so the order is different 
from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + rocblas_operation cuTransA = (transA == CblasNoTrans) + ? rocblas_operation_none + : rocblas_operation_transpose; + rocblas_operation cuTransB = (transB == CblasNoTrans) + ? rocblas_operation_none + : rocblas_operation_transpose; + context_.CublasCall([&](rocblas_handle handle) { + CUBlas::GEMM(handle, cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda, + &beta, C, N); + }); +} template <> template <> @@ -448,6 +512,43 @@ inline void Blas::GEMM( rocblas_datatype_f16_r, ldb, A, rocblas_datatype_f16_r, lda, &h_beta, C, rocblas_datatype_f16_r, N, rocblas_datatype_f32_r); } +template <> +template <> +inline void Blas::GEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, int M, int N, + int K, platform::float16 alpha, + const platform::float16 *A, + const platform::float16 *B, + platform::float16 beta, + platform::float16 *C) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + rocblas_operation cuTransA = (transA == CblasNoTrans) + ? rocblas_operation_none + : rocblas_operation_transpose; + rocblas_operation cuTransB = (transB == CblasNoTrans) + ? rocblas_operation_none + : rocblas_operation_transpose; + + // TODO(kexinzhao): add processing code for compute capability < 53 case + PADDLE_ENFORCE_GE( + context_.GetComputeCapability(), 53, + platform::errors::InvalidArgument( + "cublas fp16 gemm requires GPU compute capability >= 53," + "but received %d", + context_.GetComputeCapability())); + + float h_alpha = static_cast(alpha); + float h_beta = static_cast(beta); + + auto &cuda_ctx = const_cast(context_); + CUBlas::GEMM_EX( + &cuda_ctx, cuTransB, cuTransA, N, M, K, &h_alpha, B, + rocblas_datatype_f16_r, ldb, A, rocblas_datatype_f16_r, lda, &h_beta, C, + rocblas_datatype_f16_r, N, rocblas_datatype_f32_r); +} template <> template <> @@ -485,6 +586,44 @@ inline void Blas::GEMM( rocblas_datatype_f32_c, ldb, A, rocblas_datatype_f32_c, lda, &c_beta, C, rocblas_datatype_f32_c, N, rocblas_datatype_f32_c); } +template <> +template <> +inline void Blas::GEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, int M, int N, + int K, platform::complex alpha, + const platform::complex *A, + const platform::complex *B, + platform::complex beta, + platform::complex *C) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + rocblas_operation cuTransA = (transA == CblasNoTrans) + ? rocblas_operation_none + : rocblas_operation_transpose; + rocblas_operation cuTransB = (transB == CblasNoTrans) + ? 
rocblas_operation_none + : rocblas_operation_transpose; + + // TODO(kexinzhao): add processing code for compute capability < 53 case + PADDLE_ENFORCE_GE( + context_.GetComputeCapability(), 53, + platform::errors::InvalidArgument( + "cublas complex64 gemm requires GPU compute capability >= 53," + "but received %d", + context_.GetComputeCapability())); + + thrust::complex c_alpha = + thrust::complex(alpha.real, alpha.imag); + thrust::complex c_beta = thrust::complex(beta.real, beta.imag); + + auto &cuda_ctx = const_cast(context_); + CUBlas>::GEMM_EX( + &cuda_ctx, cuTransB, cuTransA, N, M, K, &c_alpha, B, + rocblas_datatype_f32_c, ldb, A, rocblas_datatype_f32_c, lda, &c_beta, C, + rocblas_datatype_f32_c, N, rocblas_datatype_f32_c); +} template <> template <> @@ -523,6 +662,45 @@ inline void Blas::GEMM( rocblas_datatype_f64_c, ldb, A, rocblas_datatype_f64_c, lda, &c_beta, C, rocblas_datatype_f64_c, N, rocblas_datatype_f64_c); } +template <> +template <> +inline void Blas::GEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, int M, int N, + int K, platform::complex alpha, + const platform::complex *A, + const platform::complex *B, + platform::complex beta, + platform::complex *C) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + rocblas_operation cuTransA = (transA == CblasNoTrans) + ? rocblas_operation_none + : rocblas_operation_transpose; + rocblas_operation cuTransB = (transB == CblasNoTrans) + ? rocblas_operation_none + : rocblas_operation_transpose; + + // TODO(kexinzhao): add processing code for compute capability < 53 case + PADDLE_ENFORCE_GE( + context_.GetComputeCapability(), 53, + platform::errors::InvalidArgument( + "cublas complex128 gemm requires GPU compute capability >= 53," + "but received %d", + context_.GetComputeCapability())); + + thrust::complex c_alpha = + thrust::complex(alpha.real, alpha.imag); + thrust::complex c_beta = + thrust::complex(beta.real, beta.imag); + + auto &cuda_ctx = const_cast(context_); + CUBlas>::GEMM_EX( + &cuda_ctx, cuTransB, cuTransA, N, M, K, &c_alpha, B, + rocblas_datatype_f64_c, ldb, A, rocblas_datatype_f64_c, lda, &c_beta, C, + rocblas_datatype_f64_c, N, rocblas_datatype_f64_c); +} template <> template @@ -541,6 +719,22 @@ void Blas::GEMM(bool transA, bool transB, int M, &beta, C, ldc); }); } +template <> +template +void Blas::GEMM(bool transA, bool transB, int M, int N, int K, + T alpha, const T *A, int lda, const T *B, + int ldb, T beta, T *C, int ldc) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + rocblas_operation cuTransA = + transA ? rocblas_operation_transpose : rocblas_operation_none; + rocblas_operation cuTransB = + transB ? rocblas_operation_transpose : rocblas_operation_none; + context_.CublasCall([&](rocblas_handle handle) { + CUBlas::GEMM(handle, cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda, + &beta, C, ldc); + }); +} template <> template <> @@ -560,6 +754,26 @@ inline void Blas::GEMM( B, ldb, A, lda, &beta, C, ldc); }); } +template <> +template <> +inline void Blas::GEMM(bool transA, bool transB, int M, int N, + int K, platform::float16 alpha, + const platform::float16 *A, int lda, + const platform::float16 *B, int ldb, + platform::float16 beta, + platform::float16 *C, int ldc) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. 
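The "fortran order" comments refer to the standard row-major trick: a column-major BLAS computes a row-major C = A * B through the identity C^T = B^T * A^T, which is why every wrapper here passes (cuTransB, cuTransA, N, M, K, ..., B, ldb, A, lda, ..., C, N). A minimal sketch of the no-transpose case, shown with cuBLAS for concreteness (rocblas_sgemm takes its parameters in the same order); the helper name is an assumption, not from the patch.

// Sketch: row-major C[M x N] = A[M x K] * B[K x N] via a column-major BLAS.
// A row-major buffer reinterpreted as column-major is its own transpose, so
// swapping the operands and the M/N extents yields C^T in column-major
// storage, which is exactly C in row-major storage.
#include <cublas_v2.h>

void RowMajorSgemm(cublasHandle_t handle, int M, int N, int K,
                   const float* A,  // M x K, row-major
                   const float* B,  // K x N, row-major
                   float* C) {      // M x N, row-major
  const float alpha = 1.0f;
  const float beta = 0.0f;
  cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
              /*m=*/N, /*n=*/M, /*k=*/K, &alpha,
              /*A=*/B, /*lda=*/N,
              /*B=*/A, /*ldb=*/K, &beta,
              /*C=*/C, /*ldc=*/N);
}

With transposes requested, the wrappers simply flip the operation flags and pick lda/ldb accordingly, as in the code above and below.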
+ rocblas_operation cuTransA = + transA ? rocblas_operation_transpose : rocblas_operation_none; + rocblas_operation cuTransB = + transB ? rocblas_operation_transpose : rocblas_operation_none; + + context_.CublasCall([&](rocblas_handle handle) { + CUBlas::GEMM(handle, cuTransB, cuTransA, N, M, K, &alpha, + B, ldb, A, lda, &beta, C, ldc); + }); +} template <> template @@ -569,6 +783,13 @@ void Blas::AXPY(int n, T alpha, const T *x, CUBlas::AXPY(handle, n, &alpha, x, 1, y, 1); }); } +template <> +template +void Blas::AXPY(int n, T alpha, const T *x, T *y) const { + context_.CublasCall([&](rocblas_handle handle) { + CUBlas::AXPY(handle, n, &alpha, x, 1, y, 1); + }); +} template <> template @@ -576,6 +797,12 @@ void Blas::SCAL(int n, const T alpha, T *x) const { context_.CublasCall( [&](rocblas_handle handle) { CUBlas::SCAL(handle, n, &alpha, x, 1); }); } +template <> +template +void Blas::SCAL(int n, const T alpha, T *x) const { + context_.CublasCall( + [&](rocblas_handle handle) { CUBlas::SCAL(handle, n, &alpha, x, 1); }); +} template <> template @@ -583,6 +810,12 @@ void Blas::VCOPY(int n, const T *x, T *y) const { context_.CublasCall( [&](rocblas_handle handle) { CUBlas::VCOPY(handle, n, x, 1, y, 1); }); } +template <> +template +void Blas::VCOPY(int n, const T *x, T *y) const { + context_.CublasCall( + [&](rocblas_handle handle) { CUBlas::VCOPY(handle, n, x, 1, y, 1); }); +} template <> template @@ -596,6 +829,17 @@ void Blas::GEMV(bool trans_a, int M, int N, CUBlas::GEMV(handle, cuTransA, N, M, &alpha, A, N, B, 1, &beta, C, 1); }); } +template <> +template +void Blas::GEMV(bool trans_a, int M, int N, T alpha, + const T *A, const T *B, T beta, T *C) const { + rocblas_operation cuTransA = + !trans_a ? rocblas_operation_transpose : rocblas_operation_none; + + context_.CublasCall([&](rocblas_handle handle) { + CUBlas::GEMV(handle, cuTransA, N, M, &alpha, A, N, B, 1, &beta, C, 1); + }); +} template <> template <> @@ -612,6 +856,23 @@ inline void Blas::GEMV( alpha, A, B, beta, C); } } +template <> +template <> +inline void Blas::GEMV(bool trans_a, int M, int N, + platform::float16 alpha, + const platform::float16 *A, + const platform::float16 *B, + platform::float16 beta, + platform::float16 *C) const { + // Because cublas doesn't support half gemv, we use cublasHgemm to achieve it. + if (trans_a) { + this->template GEMM(CblasNoTrans, CblasNoTrans, 1, N, M, + alpha, B, A, beta, C); + } else { + this->template GEMM(CblasNoTrans, CblasNoTrans, M, 1, N, + alpha, A, B, beta, C); + } +} template <> template @@ -637,6 +898,32 @@ void Blas::BatchedGEMM( ldc, strideC, batchCount); }); } +template <> +template +void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, int M, int N, + int K, T alpha, const T *A, const T *B, + T beta, T *C, int batchCount, + int64_t strideA, + int64_t strideB) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + int ldc = N; + rocblas_operation cuTransA = (transA == CblasNoTrans) + ? rocblas_operation_none + : rocblas_operation_transpose; + rocblas_operation cuTransB = (transB == CblasNoTrans) + ? 
rocblas_operation_none + : rocblas_operation_transpose; + const int64_t strideC = M * N; + context_.CublasCall([&](rocblas_handle handle) { + CUBlas::GEMM_STRIDED_BATCH(handle, cuTransB, cuTransA, N, M, K, &alpha, + B, ldb, strideB, A, lda, strideA, &beta, C, + ldc, strideC, batchCount); + }); +} template <> template @@ -648,6 +935,18 @@ void Blas::BatchedGEMM( C[k]); } } +template <> +template +void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, int M, int N, + int K, T alpha, const T **A, + const T **B, T beta, T **C, + int batchCount) const { + for (int k = 0; k < batchCount; ++k) { + this->template GEMM(transA, transB, M, N, K, alpha, A[k], B[k], beta, + C[k]); + } +} template <> template <> @@ -661,6 +960,18 @@ inline void Blas::BatchedGEMM( B[k], beta, C[k]); } } +template <> +template <> +inline void Blas::BatchedGEMM( + CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K, + platform::float16 alpha, const platform::float16 **A, + const platform::float16 **B, platform::float16 beta, platform::float16 **C, + int batchCount) const { + for (int k = 0; k < batchCount; ++k) { + this->template GEMM(transA, transB, M, N, K, alpha, A[k], + B[k], beta, C[k]); + } +} template <> template @@ -687,6 +998,30 @@ void Blas::TRSM(CBLAS_SIDE side, CBLAS_UPLO uplo, lda, B, ldb); }); } +template <> +template +void Blas::TRSM(CBLAS_SIDE side, CBLAS_UPLO uplo, + CBLAS_TRANSPOSE transA, CBLAS_DIAG diag, + int M, int N, T alpha, const T *A, int lda, + T *B, int ldb) const { + // solve row major `op ( A ) X = α B` by taking it as `X' op ( A' ) = α B'` + // where ' stands for transpose + rocblas_side cuSide = + (side == CblasLeft) ? rocblas_side_right : rocblas_side_left; + rocblas_fill cuUplo = + (uplo == CblasLower) ? rocblas_fill_upper : rocblas_fill_lower; + // use CUBLAS_OP_C (conjugate transpose) for complex + rocblas_operation cuTransA = (transA == CblasNoTrans) + ? rocblas_operation_none + : rocblas_operation_transpose; + rocblas_diagonal cuDiag = + (diag == CblasUnit) ? rocblas_diagonal_unit : rocblas_diagonal_non_unit; + + context_.CublasCall([&](rocblas_handle handle) { + CUBlas::TRSM(handle, cuSide, cuUplo, cuTransA, cuDiag, N, M, &alpha, A, + lda, B, ldb); + }); +} template <> template @@ -697,6 +1032,14 @@ void Blas::BatchedGETRF(int n, T **a, int *ipiv, CUBlas::GETRF_BATCH(handle, n, a, n, ipiv, info, batch_size); }); } +template <> +template +void Blas::BatchedGETRF(int n, T **a, int *ipiv, int *info, + int batch_size) const { + context_.CublasCall([&](rocblas_handle handle) { + CUBlas::GETRF_BATCH(handle, n, a, n, ipiv, info, batch_size); + }); +} template <> template @@ -715,6 +1058,22 @@ void Blas::BatchedGETRI(int n, const T **a, CUBlas::GETRI_BATCH(handle, n, a, n, ipiv, a_inv, n, info, batch_size); }); } +template <> +template +void Blas::BatchedGETRI(int n, const T **a, const int *ipiv, + T **a_inv, int *info, + int batch_size) const { + PADDLE_ENFORCE_NE( + a_inv, a, + platform::errors::InvalidArgument( + "cuBLAS fuction 'cublasgetrfBatched' cannot be executed " + "in-place. 
The memory space of output matrix (address: %p) cannot " + "overlap memory space of input matrix (address: %p).", + a_inv, a)); + context_.CublasCall([&](rocblas_handle handle) { + CUBlas::GETRI_BATCH(handle, n, a, n, ipiv, a_inv, n, info, batch_size); + }); +} template <> template @@ -725,6 +1084,14 @@ void Blas::BatchedMatInv(int n, const T **a, CUBlas::MATINV_BATCH(handle, n, a, n, a_inv, n, info, batch_size); }); } +template <> +template +void Blas::BatchedMatInv(int n, const T **a, T **a_inv, + int *info, int batch_size) const { + context_.CublasCall([&](rocblas_handle handle) { + CUBlas::MATINV_BATCH(handle, n, a, n, a_inv, n, info, batch_size); + }); +} template <> template @@ -739,6 +1106,20 @@ void Blas::BatchedGETRS( batch_size); }); } +template <> +template +void Blas::BatchedGETRS(CBLAS_TRANSPOSE trans, int n, + int nrhs, const T **a, int lda, + int *ipiv, T **b, int ldb, int *info, + int batch_size) const { + rocblas_operation cuTrans = (trans == CblasNoTrans) + ? rocblas_operation_none + : rocblas_operation_transpose; + context_.CublasCall([&](rocblas_handle handle) { + CUBlas::GETRS_BATCH(handle, cuTrans, n, nrhs, a, lda, ipiv, b, ldb, info, + batch_size); + }); +} template <> template @@ -764,6 +1145,31 @@ void Blas::BatchedTRSM( &alpha, A, lda, B, ldb, batch_size); }); } +template <> +template +void Blas::BatchedTRSM(CBLAS_SIDE side, CBLAS_UPLO uplo, + CBLAS_TRANSPOSE transA, + CBLAS_DIAG diag, int M, int N, T alpha, + const T **A, int lda, T **B, int ldb, + int batch_size) const { + // solve row major `op ( A ) X = α B` by taking it as `X' op ( A' ) = α B'` + // where ' stands for transpose + rocblas_side cuSide = + (side == CblasLeft) ? rocblas_side_right : rocblas_side_left; + rocblas_fill cuUplo = + (uplo == CblasLower) ? rocblas_fill_upper : rocblas_fill_lower; + // use CUBLAS_OP_C (conjugate transpose) for complex + rocblas_operation cuTransA = (transA == CblasNoTrans) + ? rocblas_operation_none + : rocblas_operation_transpose; + rocblas_diagonal cuDiag = + (diag == CblasUnit) ? rocblas_diagonal_unit : rocblas_diagonal_non_unit; + + context_.CublasCall([&](rocblas_handle handle) { + CUBlas::TRSM_BATCH(handle, cuSide, cuUplo, cuTransA, cuDiag, N, M, + &alpha, A, lda, B, ldb, batch_size); + }); +} } // namespace math } // namespace operators diff --git a/paddle/fluid/operators/math/concat_test.cc b/paddle/fluid/operators/math/concat_test.cc index c8e2acea451a4..65e48e5817574 100644 --- a/paddle/fluid/operators/math/concat_test.cc +++ b/paddle/fluid/operators/math/concat_test.cc @@ -16,6 +16,8 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/math/concat_and_split.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/place.h" /** * case 1: @@ -441,6 +443,31 @@ void TestConcatMain() { delete context; } +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +template <> +void TestConcatMain() { + auto* context = + new paddle::platform::CUDADeviceContext(paddle::platform::CUDAPlace()); + context->SetAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CUDAPlace(), context->stream()) + .get()); + context->PartialInitWithAllocator(); + + ConcatCase1( + context); + ConcatCase2( + context); + ConcatCase3( + context); + ConcatCase4( + context); + + delete context; +} +#endif + TEST(math, concat) { TestConcatMain(); diff --git a/paddle/fluid/operators/math/cusparse_conversion_api_test.cc b/paddle/fluid/operators/math/cusparse_conversion_api_test.cc index d45b57420eef1..a84b734fbeb2d 100644 --- a/paddle/fluid/operators/math/cusparse_conversion_api_test.cc +++ b/paddle/fluid/operators/math/cusparse_conversion_api_test.cc @@ -24,6 +24,11 @@ void TestNNZ(const std::vector& dense_data, const int correct_nnz, const int rows, const int cols) { paddle::platform::CUDADeviceContext* context = new paddle::platform::CUDADeviceContext(paddle::platform::CUDAPlace()); + context->SetAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CUDAPlace(), context->stream()) + .get()); + context->PartialInitWithAllocator(); auto sparse = paddle::operators::math::GetSparse(*context); @@ -61,6 +66,11 @@ void TestDenseToSparse(const std::vector& correct_dense_data, const std::string& mode) { paddle::platform::CUDADeviceContext* context = new paddle::platform::CUDADeviceContext(paddle::platform::CUDAPlace()); + context->SetAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CUDAPlace(), context->stream()) + .get()); + context->PartialInitWithAllocator(); // get sparse auto sparse = paddle::operators::math::GetSparse #include "paddle/fluid/operators/math/im2col_cfo_cpu.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/place.h" template void testIm2col() { @@ -60,6 +62,7 @@ void testIm2col() { auto* place = new Place(); DeviceContext* context = new DeviceContext(*place); + if (paddle::platform::is_cpu_place(*place)) { input = input_tmp; } else { @@ -164,6 +167,165 @@ void testIm2col() { delete context; } +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +template <> +void testIm2col() { + paddle::framework::Tensor input_tmp; + paddle::framework::Tensor input; + paddle::framework::Tensor output_cfo; + paddle::framework::Tensor output_ocf; + paddle::framework::Tensor output_tmp; + + /** + * input = [0, 1, 2, + * 3, 4, 5] + * + * output_cfo = [0, 1 + * 1, 2 + * 3, 4 + * 4, 5] + * + * output_ocf = [0, 1, 3, 4 + * 1, 2, 4, 5] + * + * col2im_cfo = [0, 2, 2 + * 3, 4, 5] + * + * col2im_ocf = [0, 2, 2 + * 3, 4, 5] + */ + int input_height = 2; + int input_width = 3; + int filter_size = 2; + std::vector stride({1, 1}); // stride_y, stride_x + std::vector padding( + {0, 0, 0, 0}); // up_pad, left_pad, down_pad, right_pad + std::vector dilation({1, 1}); // dilation_y, dilation_x + int output_height = + (input_height - filter_size + padding[0] + padding[1]) / stride[0] + 1; + int output_width = + (input_width - filter_size + padding[2] + padding[3]) / stride[1] + 1; + 
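For reference, with this 2 x 3 input, a 2 x 2 filter, stride 1 and zero padding the formulas give output_height = (2 - 2 + 0 + 0) / 1 + 1 = 1 and output_width = (3 - 2 + 0 + 0) / 1 + 1 = 2. A minimal CPU reference for the kCFO layout [channels, filter_h, filter_w, output_h, output_w] reproduces the expected out_cfo_data used later in this test; it is illustrative only (no padding or dilation), and the function name is not from the test.

// CPU reference sketch of kCFO im2col for a single image.
#include <vector>

std::vector<float> Im2ColCFO(const std::vector<float>& in, int C, int H, int W,
                             int KH, int KW, int stride) {
  const int OH = (H - KH) / stride + 1;
  const int OW = (W - KW) / stride + 1;
  std::vector<float> col(C * KH * KW * OH * OW);
  int idx = 0;
  for (int c = 0; c < C; ++c)
    for (int kh = 0; kh < KH; ++kh)
      for (int kw = 0; kw < KW; ++kw)
        for (int oh = 0; oh < OH; ++oh)
          for (int ow = 0; ow < OW; ++ow)
            // Copy the input pixel covered by filter offset (kh, kw) at
            // output position (oh, ow).
            col[idx++] = in[(c * H + oh * stride + kh) * W + ow * stride + kw];
  return col;
}

// Usage: Im2ColCFO({0, 1, 2, 3, 4, 5}, 1, 2, 3, 2, 2, 1)
//        returns {0, 1, 1, 2, 3, 4, 4, 5}, matching out_cfo_data below.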
float* input_ptr = input_tmp.mutable_data( + {1, input_height, input_width}, paddle::platform::CPUPlace()); + float arr[6] = {0, 1, 2, 3, 4, 5}; + memcpy(input_ptr, arr, 6 * sizeof(float)); + + auto* place = new paddle::platform::CUDAPlace(); + auto* context = new paddle::platform::CUDADeviceContext(*place); + context->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(*place, context->stream()) + .get()); + context->PartialInitWithAllocator(); + if (paddle::platform::is_cpu_place(*place)) { + input = input_tmp; + } else { + paddle::framework::TensorCopySync(input_tmp, *place, &input); + } + output_cfo.mutable_data( + {1, filter_size, filter_size, output_height, output_width}, *place); + output_ocf.mutable_data( + {output_height, output_width, 1, filter_size, filter_size}, *place); + + // Im2Col + paddle::operators::math::Im2ColFunctor< + paddle::operators::math::ColFormat::kCFO, + paddle::platform::CUDADeviceContext, float> + im2col; + paddle::operators::math::Im2ColFunctor< + paddle::operators::math::ColFormat::kOCF, + paddle::platform::CUDADeviceContext, float> + im2col_ocf; + + im2col(*context, input, dilation, stride, padding, &output_cfo); + im2col_ocf(*context, input, dilation, stride, padding, &output_ocf); + + float out_cfo_data[] = {0, 1, 1, 2, 3, 4, 4, 5}; + float out_ocf_data[] = {0, 1, 3, 4, 1, 2, 4, 5}; + + float* out_cfo_ptr; + if (paddle::platform::is_cpu_place(*place)) { + out_cfo_ptr = output_cfo.data(); + } else { + paddle::framework::TensorCopySync(output_cfo, paddle::platform::CPUPlace(), + &output_tmp); + out_cfo_ptr = output_tmp.data(); + } + for (int i = 0; i < 6; ++i) { + EXPECT_EQ(out_cfo_ptr[i], out_cfo_data[i]); + } + + float* out_ocf_ptr; + if (paddle::platform::is_cpu_place(*place)) { + out_ocf_ptr = output_ocf.data(); + } else { + paddle::framework::TensorCopySync(output_ocf, paddle::platform::CPUPlace(), + &output_tmp); + out_ocf_ptr = output_tmp.data(); + } + + for (int i = 0; i < 6; ++i) { + EXPECT_EQ(out_ocf_ptr[i], out_ocf_data[i]); + } + + // Col2Im: kCFO + paddle::operators::math::Col2ImFunctor< + paddle::operators::math::ColFormat::kCFO, + paddle::platform::CUDADeviceContext, float> + col2im; + paddle::operators::math::Col2ImFunctor< + paddle::operators::math::ColFormat::kOCF, + paddle::platform::CUDADeviceContext, float> + col2im_ocf; + float col2im_data[] = {0, 2, 2, 3, 8, 5}; + + memset(input_ptr, 0, 6 * sizeof(float)); + if (paddle::platform::is_cpu_place(*place)) { + input = input_tmp; + } else { + paddle::framework::TensorCopySync(input_tmp, *place, &input); + } + + col2im(*context, output_cfo, dilation, stride, padding, &input); + + float* in_ptr; + if (paddle::platform::is_cpu_place(*place)) { + in_ptr = input.data(); + } else { + paddle::framework::TensorCopySync(input, paddle::platform::CPUPlace(), + &input_tmp); + in_ptr = input_tmp.data(); + } + for (int i = 0; i < 6; ++i) { + EXPECT_EQ(in_ptr[i], col2im_data[i]); + } + + // Col2Im: kOCF + memset(input_ptr, 0, 6 * sizeof(float)); + if (paddle::platform::is_cpu_place(*place)) { + input = input_tmp; + } else { + paddle::framework::TensorCopySync(input_tmp, *place, &input); + } + + col2im_ocf(*context, output_ocf, dilation, stride, padding, &input); + + if (paddle::platform::is_cpu_place(*place)) { + in_ptr = input.data(); + } else { + paddle::framework::TensorCopySync(input, paddle::platform::CPUPlace(), + &input_tmp); + in_ptr = input_tmp.data(); + } + for (int i = 0; i < 6; ++i) { + EXPECT_EQ(in_ptr[i], col2im_data[i]); + } + + delete place; + delete 
context; +} +#endif + TEST(math, im2col) { testIm2col(); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) diff --git a/paddle/fluid/operators/math/inclusive_scan.h b/paddle/fluid/operators/math/inclusive_scan.h index 54a37db1df71a..5fdc2889a8885 100644 --- a/paddle/fluid/operators/math/inclusive_scan.h +++ b/paddle/fluid/operators/math/inclusive_scan.h @@ -194,7 +194,7 @@ static void InclusiveScanInnerDim(const T *x, T *y, size_t outer_dim, constexpr size_t kThreadNumY = 32; size_t grid_dim = (outer_dim + kThreadNumY - 1) / kThreadNumY; - grid_dim = std::min(grid_dim, dev_ctx.GetCUDAMaxGridDimSize().x); + grid_dim = std::min(grid_dim, dev_ctx.GetCUDAMaxGridDimSize()[0]); dim3 thread_dims(kThreadNumX, kThreadNumY); if (reverse) { InclusiveScanInnerDimCUDAKernel< diff --git a/paddle/fluid/operators/math/math_function.cu b/paddle/fluid/operators/math/math_function.cu index 9ade45ee743a3..f0ef692b99f57 100644 --- a/paddle/fluid/operators/math/math_function.cu +++ b/paddle/fluid/operators/math/math_function.cu @@ -21,6 +21,7 @@ limitations under the License. */ #include "paddle/fluid/operators/math/math_function_impl.h" #include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/pten/backends/gpu/gpu_context.h" #include "paddle/pten/kernels/funcs/eigen/common.h" namespace paddle { @@ -44,6 +45,18 @@ template struct SetConstant>; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant>; +template struct SetConstant>; + template struct SetConstant; template struct SetConstant(paddle::platform::CPUPlace()); auto* ctx = new paddle::platform::CPUDeviceContext(); + ctx->Init(); paddle::operators::math::set_constant(*ctx, &t, 10); for (int64_t i = 0; i < t.numel(); ++i) { PADDLE_ENFORCE_EQ(10, t.data()[i], diff --git a/paddle/fluid/operators/math/math_function_test.cu b/paddle/fluid/operators/math/math_function_test.cu index 44b1ee45a4fe9..39c91e96a70fa 100644 --- a/paddle/fluid/operators/math/math_function_test.cu +++ b/paddle/fluid/operators/math/math_function_test.cu @@ -46,6 +46,10 @@ TEST(math_function, notrans_mul_trans_fp32) { paddle::platform::CPUPlace cpu_place; paddle::platform::CUDAPlace gpu_place(0); paddle::platform::CUDADeviceContext context(gpu_place); + context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(gpu_place, context.stream()) + .get()); + context.PartialInitWithAllocator(); float* input1_ptr = input1.mutable_data({2, 3}, cpu_place); float arr[6] = {0, 1, 2, 3, 4, 5}; @@ -78,6 +82,10 @@ TEST(math_function, notrans_mul_trans_fp16) { paddle::platform::CPUPlace cpu_place; paddle::platform::CUDAPlace gpu_place(0); paddle::platform::CUDADeviceContext context(gpu_place); + context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(gpu_place, context.stream()) + .get()); + context.PartialInitWithAllocator(); // fp16 GEMM in cublas requires GPU compute capability >= 53 if (context.GetComputeCapability() < 53) { @@ -117,6 +125,10 @@ TEST(math_function, trans_mul_notrans_fp32) { paddle::platform::CPUPlace cpu_place; paddle::platform::CUDAPlace gpu_place(0); paddle::platform::CUDADeviceContext context(gpu_place); + context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(gpu_place, 
context.stream()) + .get()); + context.PartialInitWithAllocator(); float* input1_ptr = input1.mutable_data({2, 3}, cpu_place); float arr[6] = {0, 1, 2, 3, 4, 5}; @@ -155,6 +167,10 @@ TEST(math_function, trans_mul_notrans_fp16) { paddle::platform::CPUPlace cpu_place; paddle::platform::CUDAPlace gpu_place(0); paddle::platform::CUDADeviceContext context(gpu_place); + context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(gpu_place, context.stream()) + .get()); + context.PartialInitWithAllocator(); // fp16 GEMM in cublas requires GPU compute capability >= 53 if (context.GetComputeCapability() < 53) { @@ -200,6 +216,10 @@ TEST(math_function, gemm_notrans_cublas_fp32) { paddle::platform::CPUPlace cpu_place; paddle::platform::CUDAPlace gpu_place(0); paddle::platform::CUDADeviceContext context(gpu_place); + context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(gpu_place, context.stream()) + .get()); + context.PartialInitWithAllocator(); int m = 2; int n = 3; @@ -254,6 +274,10 @@ TEST(math_function, gemm_notrans_cublas_fp16) { paddle::platform::CPUPlace cpu_place; paddle::platform::CUDAPlace gpu_place(0); paddle::platform::CUDADeviceContext context(gpu_place); + context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(gpu_place, context.stream()) + .get()); + context.PartialInitWithAllocator(); // fp16 GEMM in cublas requires GPU compute capability >= 53 if (context.GetComputeCapability() < 53) { @@ -316,6 +340,10 @@ TEST(math_function, gemm_trans_cublas_fp32) { paddle::platform::CPUPlace cpu_place; paddle::platform::CUDAPlace gpu_place(0); paddle::platform::CUDADeviceContext context(gpu_place); + context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(gpu_place, context.stream()) + .get()); + context.PartialInitWithAllocator(); int m = 2; int n = 3; @@ -364,6 +392,10 @@ TEST(math_function, gemm_trans_cublas_fp16) { paddle::platform::CPUPlace cpu_place; paddle::platform::CUDAPlace gpu_place(0); paddle::platform::CUDADeviceContext context(gpu_place); + context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(gpu_place, context.stream()) + .get()); + context.PartialInitWithAllocator(); // fp16 GEMM in cublas requires GPU compute capability >= 53 if (context.GetComputeCapability() < 53) { @@ -418,6 +450,10 @@ void GemvTest(int m, int n, bool trans) { paddle::platform::CPUPlace cpu_place; paddle::platform::CUDAPlace gpu_place(0); paddle::platform::CUDADeviceContext context(gpu_place); + context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(gpu_place, context.stream()) + .get()); + context.PartialInitWithAllocator(); T* data_a = mat_a.mutable_data({m, n}, cpu_place); T* data_b = vec_b.mutable_data({trans ? m : n}, cpu_place); diff --git a/paddle/fluid/operators/math/vol2col_test.cc b/paddle/fluid/operators/math/vol2col_test.cc index 8cd2824465879..210cf10d8879d 100644 --- a/paddle/fluid/operators/math/vol2col_test.cc +++ b/paddle/fluid/operators/math/vol2col_test.cc @@ -15,6 +15,8 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/math/vol2col.h" #include +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/place.h" template void testVol2col() { @@ -25,7 +27,6 @@ void testVol2col() { auto* place = new Place(); DeviceContext* context = new DeviceContext(*place); - /** * input = [[0, 1, 2, * 3, 4, 5] @@ -123,6 +124,124 @@ void testVol2col() { delete context; } +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +template <> +void testVol2col() { + paddle::framework::Tensor input; + paddle::framework::Tensor input_tmp; + paddle::framework::Tensor output; + paddle::framework::Tensor output_tmp; + + auto* place = new paddle::platform::CUDAPlace(); + auto* context = new paddle::platform::CUDADeviceContext(*place); + context->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(*place, context->stream()) + .get()); + context->PartialInitWithAllocator(); + + /** + * input = [[0, 1, 2, + * 3, 4, 5] + * [6, 7, 8, + * 9, 10, 11]] + * + * output = [0, 1 + * 1, 2 + * 3, 4 + * 4, 5 + * 6, 7 + * 7, 8 + * 9, 10 + * 10, 11] + * + * col2vol = [[0, 2, 2, + * 3, 8, 5] + * [6, 14, 8, + * 9, 20, 11]] + * + */ + int input_depth = 2; + int input_height = 2; + int input_width = 3; + int filter_size = 2; + std::vector strides({1, 1, 1}); + std::vector paddings({0, 0, 0}); + std::vector dilations({1, 1, 1}); + int output_depth = + (input_depth - filter_size + 2 * paddings[0]) / strides[0] + 1; + int output_height = + (input_height - filter_size + 2 * paddings[1]) / strides[1] + 1; + int output_width = + (input_width - filter_size + 2 * paddings[2]) / strides[2] + 1; + + // Vol2Col test + float* input_ptr = + input_tmp.mutable_data({1, input_depth, input_height, input_width}, + paddle::platform::CPUPlace()); + float arr[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; + memcpy(input_ptr, arr, 12 * sizeof(float)); + + if (paddle::platform::is_cpu_place(*place)) { + input = input_tmp; + } else { + paddle::framework::TensorCopySync(input_tmp, *place, &input); + } + output.mutable_data({1, filter_size, filter_size, filter_size, + output_depth, output_height, output_width}, + *place); + + paddle::operators::math::Vol2ColFunctor + vol2col; + vol2col(*context, input, dilations, strides, paddings, &output); + + float vol_2_col[] = {0, 1, 1, 2, 3, 4, 4, 5, 6, 7, 7, 8, 9, 10, 10, 11}; + float* out_cfo_ptr; + if (paddle::platform::is_cpu_place(*place)) { + out_cfo_ptr = output.data(); + } else { + paddle::framework::TensorCopySync(output, paddle::platform::CPUPlace(), + &output_tmp); + out_cfo_ptr = output_tmp.data(); + } + + for (int i = 0; i < 16; ++i) { + EXPECT_EQ(out_cfo_ptr[i], vol_2_col[i]); + } + + // Col2Vol test + float col_2_vol[] = {0, 2, 2, 3, 8, 5, 6, 14, 8, 9, 20, 11}; + memset(input_ptr, 0, 12 * sizeof(float)); + if (paddle::platform::is_cpu_place(*place)) { + input = input_tmp; + } else { + paddle::framework::TensorCopySync(input_tmp, *place, &input); + } + + paddle::operators::math::Col2VolFunctor + col2vol; + col2vol(*context, output, dilations, strides, paddings, &input); + + float* in_ptr; + if (paddle::platform::is_cpu_place(*place)) { + in_ptr = input.data(); + } else { + paddle::framework::TensorCopySync(input, paddle::platform::CPUPlace(), + &input_tmp); + in_ptr = input_tmp.data(); + } + + for (int i = 0; i < 12; ++i) { + EXPECT_EQ(in_ptr[i], col_2_vol[i]); + } + + delete place; + delete context; +} +#endif + TEST(math, vol2col) { testVol2col(); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) diff --git 
a/paddle/fluid/operators/mean_op.cu b/paddle/fluid/operators/mean_op.cu index 63b5b871aabb5..db8f586297c54 100644 --- a/paddle/fluid/operators/mean_op.cu +++ b/paddle/fluid/operators/mean_op.cu @@ -66,7 +66,8 @@ class MeanCUDAKernel : public framework::OpKernel { reduce_dims.push_back(i); } TensorReduceFunctorImpl( - *input, output, Div(numel), reduce_dims, stream); + context.cuda_device_context(), *input, output, Div(numel), reduce_dims, + stream); } }; diff --git a/paddle/fluid/operators/nccl/nccl_op_test.cu.cc b/paddle/fluid/operators/nccl/nccl_op_test.cu.cc index 41c1b4d7a8f81..9fcbfa90230c5 100644 --- a/paddle/fluid/operators/nccl/nccl_op_test.cu.cc +++ b/paddle/fluid/operators/nccl/nccl_op_test.cu.cc @@ -57,7 +57,12 @@ class NCCLTester : public ::testing::Test { paddle::platform::CPUPlace cpu_place; for (size_t i = 0; i < gpu_list_.size(); ++i) { p::CUDAPlace place(i); - dev_ctxs_.emplace_back(new p::CUDADeviceContext(place)); + auto *ctx = new p::CUDADeviceContext(place); + ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(place, ctx->stream()) + .get()); + ctx->PartialInitWithAllocator(); + dev_ctxs_.emplace_back(ctx); } NCCLInitOp(); diff --git a/paddle/fluid/operators/p_norm_op.cu b/paddle/fluid/operators/p_norm_op.cu index c0bd906685d4d..e11fe478106f9 100644 --- a/paddle/fluid/operators/p_norm_op.cu +++ b/paddle/fluid/operators/p_norm_op.cu @@ -106,16 +106,20 @@ class PnormCUDAKernel : public framework::OpKernel { using MT = typename details::MPTypeTrait::Type; if (porder == 0) { TensorReduceFunctorImpl>( - *in_x, out_norm, NonzeroFunctor(), reduce_axis, stream); + ctx.cuda_device_context(), *in_x, out_norm, NonzeroFunctor(), + reduce_axis, stream); } else if (porder == INFINITY) { TensorReduceFunctorImpl>( - *in_x, out_norm, AbsFunctor(), reduce_axis, stream); + ctx.cuda_device_context(), *in_x, out_norm, AbsFunctor(), + reduce_axis, stream); } else if (porder == -INFINITY) { TensorReduceFunctorImpl>( - *in_x, out_norm, AbsFunctor(), reduce_axis, stream); + ctx.cuda_device_context(), *in_x, out_norm, AbsFunctor(), + reduce_axis, stream); } else { TensorReduceFunctorImpl>( - *in_x, out_norm, UnsignedPowFunctor(porder), reduce_axis, stream); + ctx.cuda_device_context(), *in_x, out_norm, + UnsignedPowFunctor(porder), reduce_axis, stream); const framework::Tensor* tmp_norm = out_norm; std::vector ins = {tmp_norm}; diff --git a/paddle/fluid/operators/pool_op.h b/paddle/fluid/operators/pool_op.h index 84c1988e29b07..f7f60e82216ae 100644 --- a/paddle/fluid/operators/pool_op.h +++ b/paddle/fluid/operators/pool_op.h @@ -208,8 +208,8 @@ class PoolKernel : public framework::OpKernel { auto stream = dev_ctx.stream(); TensorReduceFunctorImpl>( - *in_x, out, kps::DivideFunctor(reduce_num), reduce_dim, - stream); + dev_ctx, *in_x, out, kps::DivideFunctor(reduce_num), + reduce_dim, stream); #else // for cpu paddle::operators::math::Pool2dFunctor< DeviceContext, paddle::operators::math::AvgPool, T> diff --git a/paddle/fluid/operators/prelu_op.cu b/paddle/fluid/operators/prelu_op.cu index c6997603bb13b..9493b6d4391d5 100644 --- a/paddle/fluid/operators/prelu_op.cu +++ b/paddle/fluid/operators/prelu_op.cu @@ -186,7 +186,8 @@ class CUDAPReluGradKernel : public framework::OpKernel { } TensorReduceFunctorImpl>( - dalpha_tmp, dalpha, kps::IdentityFunctor(), reduce_dims, stream); + context.cuda_device_context(), dalpha_tmp, dalpha, + kps::IdentityFunctor(), reduce_dims, stream); } }; diff --git a/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc 
b/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc index 21f21cdc95606..2c701bdae7601 100644 --- a/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc +++ b/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc @@ -222,6 +222,10 @@ TEST(SENDANDRECV, GPU) { framework::Scope* scope = (*micro_scope)[0]; platform::CUDAPlace place; platform::CUDADeviceContext ctx(place); + ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(place, ctx.stream()) + .get()); + ctx.PartialInitWithAllocator(); framework::Executor exe(place); // create var on local scope diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h index 730a84da88b59..f741c5941eb26 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h @@ -39,14 +39,16 @@ namespace operators { template class ReduceOp, typename TransformOp> -void TensorReduceFunctorImpl(const framework::Tensor& x, framework::Tensor* y, +void TensorReduceFunctorImpl(const platform::CUDADeviceContext& dev_ctx, + const framework::Tensor& x, framework::Tensor* y, const TransformOp& transform, const std::vector& origin_reduce_dims, gpuStream_t stream) { y->mutable_data(x.place()); pten::kernels::TensorReduceFunctorImpl( - x, y, transform, origin_reduce_dims, stream); + static_cast(dev_ctx), x, y, transform, + origin_reduce_dims, stream); } } // namespace operators diff --git a/paddle/fluid/operators/renorm_op.cu b/paddle/fluid/operators/renorm_op.cu index 1d76eaf27e818..ca6169d0410f1 100644 --- a/paddle/fluid/operators/renorm_op.cu +++ b/paddle/fluid/operators/renorm_op.cu @@ -156,7 +156,8 @@ class CUDARenormKernel : public framework::OpKernel { cuda_ctx, ins, &outs, func); std::vector reduce_axis = {0, 2}; TensorReduceFunctorImpl>( - pow_value, &dim_value, kps::IdentityFunctor(), reduce_axis, stream); + cuda_ctx, pow_value, &dim_value, kps::IdentityFunctor(), reduce_axis, + stream); RenormKernelFunc3<<>>( numel, dim_value.mutable_data(context.GetPlace()), p, max_norm); RenormKernelFunc4<<>>( @@ -213,10 +214,11 @@ class CUDAGradRenormKernel : public framework::OpKernel { dim_divisor); std::vector reduce_axis = {0, 2}; TensorReduceFunctorImpl>( - pow_value, &dim_value, kps::IdentityFunctor(), reduce_axis, stream); + ctx.cuda_device_context(), pow_value, &dim_value, + kps::IdentityFunctor(), reduce_axis, stream); TensorReduceFunctorImpl>( - mul_value, &weight_derivative, kps::IdentityFunctor(), reduce_axis, - stream); + ctx.cuda_device_context(), mul_value, &weight_derivative, + kps::IdentityFunctor(), reduce_axis, stream); RenormGradKernelFunc2<<>>( x_data, dout_data, dx_data, numel, dim_value.mutable_data(ctx.GetPlace()), diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index d54d5234010ba..99e40de3080e3 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -389,7 +389,8 @@ class ReshapeKernel { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(ctx.GetPlace())) { auto &dev_ctx = ctx.device_context(); - pten::ReshapeKernel(dev_ctx, *in, pt_scalar_shape, out); + pten::ReshapeKernel(static_cast(dev_ctx), *in, + pt_scalar_shape, out); } #endif #ifdef PADDLE_WITH_XPU @@ -417,7 +418,8 @@ class ReshapeGradKernel { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(ctx.GetPlace())) { auto &dev_ctx = ctx.device_context(); - pten::ReshapeGradKernel(dev_ctx, *d_out, 
d_x); + pten::ReshapeGradKernel(static_cast(dev_ctx), + *d_out, d_x); } #endif #ifdef PADDLE_WITH_XPU @@ -445,7 +447,8 @@ class ReshapeDoubleGradKernel { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(ctx.GetPlace())) { auto &dev_ctx = ctx.device_context(); - pten::ReshapeDoubleGradKernel(dev_ctx, *dd_x, dd_out); + pten::ReshapeDoubleGradKernel( + static_cast(dev_ctx), *dd_x, dd_out); } #endif #ifdef PADDLE_WITH_XPU diff --git a/paddle/fluid/operators/scatter.cu.h b/paddle/fluid/operators/scatter.cu.h index 98311ff404b47..13c08aea68849 100644 --- a/paddle/fluid/operators/scatter.cu.h +++ b/paddle/fluid/operators/scatter.cu.h @@ -183,8 +183,7 @@ void GPUScatterGradForX(const platform::DeviceContext& ctx, const Tensor& index, int64_t max_grid_dimx = reinterpret_cast(ctx) - .GetCUDAMaxGridDimSize() - .x; + .GetCUDAMaxGridDimSize()[0]; int64_t grid = height < max_grid_dimx ? height : max_grid_dimx; ScatterInitCUDAKernel<<< diff --git a/paddle/fluid/operators/solve_op.h b/paddle/fluid/operators/solve_op.h index 0acef78484cd3..ea3a5aa5af9b2 100644 --- a/paddle/fluid/operators/solve_op.h +++ b/paddle/fluid/operators/solve_op.h @@ -46,7 +46,8 @@ void ReduceSumForSolve(const Tensor* input, Tensor* output, #if defined(__NVCC__) || defined(__HIPCC__) auto stream = ctx.cuda_device_context().stream(); TensorReduceFunctorImpl>( - *input, output, kps::IdentityFunctor(), reduce_dims, stream); + ctx.cuda_device_context(), *input, output, kps::IdentityFunctor(), + reduce_dims, stream); #else ReduceKernelFunctor( input, output, reduce_dims, keep_dim, false, ctx) diff --git a/paddle/fluid/operators/strided_memcpy_test.cc b/paddle/fluid/operators/strided_memcpy_test.cc index 1ab036e869294..514bdac9c55f1 100644 --- a/paddle/fluid/operators/strided_memcpy_test.cc +++ b/paddle/fluid/operators/strided_memcpy_test.cc @@ -15,6 +15,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/strided_memcpy.h" #include "gtest/gtest.h" +#include "paddle/fluid/memory/allocation/allocator_facade.h" namespace paddle { namespace operators { @@ -86,6 +87,10 @@ TEST(StridedMemcpy, GPUCrop) { platform::CPUPlace cpu; platform::CUDADeviceContext ctx(gpu0); + ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(gpu0, ctx.stream()) + .get()); + ctx.PartialInitWithAllocator(); auto src_allocation = memory::Alloc(gpu0, sizeof(src)); @@ -124,6 +129,10 @@ TEST(StridedMemcpy, GPUConcat) { platform::CUDAPlace gpu0(0); platform::CPUPlace cpu; platform::CUDADeviceContext ctx(gpu0); + ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(gpu0, ctx.stream()) + .get()); + ctx.PartialInitWithAllocator(); auto gpu_src_allocation = memory::Alloc(gpu0, sizeof(src)); int* gpu_src = reinterpret_cast(gpu_src_allocation->ptr()); memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx.stream()); diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc index d2d04a4fa50b4..90d489c8df733 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc @@ -37,6 +37,10 @@ void CreateCUDATensor(framework::Scope* scope, const std::string& name, tensor->Resize(dims); platform::CUDAPlace place; platform::CUDADeviceContext ctx(place); + ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(place, ctx.stream()) + .get()); + ctx.PartialInitWithAllocator(); inference::tensorrt::RandomizeTensor(tensor, place, ctx); } @@ -133,6 +137,10 @@ void DynamicShapeTest(bool allow_build_at_runtime) { framework::Scope scope; platform::CUDAPlace place; platform::CUDADeviceContext ctx(place); + ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(place, ctx.stream()) + .get()); + ctx.PartialInitWithAllocator(); // Prepare variables. if (allow_build_at_runtime) CreateCUDATensor(&scope, "x", std::vector({3, 4, 1, 1})); @@ -159,6 +167,10 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) { framework::Scope scope; platform::CUDAPlace place; platform::CUDADeviceContext ctx(place); + ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(place, ctx.stream()) + .get()); + ctx.PartialInitWithAllocator(); auto* block_ = program.Proto()->add_blocks(); block_->set_idx(0); diff --git a/paddle/fluid/operators/top_k_function_cuda.h b/paddle/fluid/operators/top_k_function_cuda.h index 05ae5c9188ceb..c077411e49681 100644 --- a/paddle/fluid/operators/top_k_function_cuda.h +++ b/paddle/fluid/operators/top_k_function_cuda.h @@ -411,7 +411,7 @@ bool SortTopk(const platform::CUDADeviceContext& ctx, }; int block_size = ComputeBlockSize(num_cols); - unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize().x; + unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0]; // actually, int num_rows < max_grid_size unsigned int grid_size = num_rows < maxGridDimX ? 
static_cast(num_rows) diff --git a/paddle/fluid/operators/trace_op.cu b/paddle/fluid/operators/trace_op.cu index 98a77637f9204..0b9e615eece8f 100644 --- a/paddle/fluid/operators/trace_op.cu +++ b/paddle/fluid/operators/trace_op.cu @@ -40,7 +40,8 @@ class TraceCUDAKernel : public framework::OpKernel { std::vector reduce_dims; reduce_dims.push_back(out->dims().size()); TensorReduceFunctorImpl>( - diag, out, kps::IdentityFunctor(), reduce_dims, stream); + context.cuda_device_context(), diag, out, kps::IdentityFunctor(), + reduce_dims, stream); } else { math::SetConstant functor; functor(context.device_context(), out, static_cast(0)); diff --git a/paddle/fluid/operators/triangular_solve_op.cu b/paddle/fluid/operators/triangular_solve_op.cu index b7ea5cd953186..28cdc56e2aec1 100644 --- a/paddle/fluid/operators/triangular_solve_op.cu +++ b/paddle/fluid/operators/triangular_solve_op.cu @@ -45,7 +45,8 @@ class MatrixReduceSumFunctor { } gpuStream_t stream = ctx.cuda_device_context().stream(); TensorReduceFunctorImpl>( - in, out, kps::IdentityFunctor(), out_reduce_dims, stream); + ctx.cuda_device_context(), in, out, kps::IdentityFunctor(), + out_reduce_dims, stream); } }; diff --git a/paddle/fluid/operators/viterbi_decode_op.cu b/paddle/fluid/operators/viterbi_decode_op.cu index 7ad3335009b06..b52f46e4a8ac6 100644 --- a/paddle/fluid/operators/viterbi_decode_op.cu +++ b/paddle/fluid/operators/viterbi_decode_op.cu @@ -148,7 +148,7 @@ struct Argmax { } const auto& dev_ctx = ctx.cuda_device_context(); auto cu_stream = dev_ctx.stream(); - int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize().x; + int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0]; int64_t height = pre * post; int64_t width = n; int64_t grid_size = height < max_grid_dimx ? height : max_grid_dimx; diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index e35b586dc90b0..07d3e41958293 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -138,6 +138,7 @@ if(WITH_CNCL) endif() if(WITH_GPU OR WITH_ROCM) + target_link_libraries(device_context gpu_info gpu_context pten_gpu_info) target_link_libraries(device_context gpu_resource_pool) endif() diff --git a/paddle/fluid/platform/bfloat16_test.cu b/paddle/fluid/platform/bfloat16_test.cu index dbbb72920a53b..a82043cd7c435 100644 --- a/paddle/fluid/platform/bfloat16_test.cu +++ b/paddle/fluid/platform/bfloat16_test.cu @@ -66,6 +66,10 @@ TEST(bfloat16, lod_tensor_on_gpu) { // CPU LoDTensor to GPU LoDTensor CUDAPlace gpu_place(0); CUDADeviceContext gpu_ctx(gpu_place); + gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(gpu_place, gpu_ctx.stream()) + .get()); + gpu_ctx.PartialInitWithAllocator(); framework::TensorCopy(src_tensor, gpu_place, gpu_ctx, &gpu_tensor); // GPU LoDTensor to CPU LoDTensor diff --git a/paddle/fluid/platform/collective_helper.cc b/paddle/fluid/platform/collective_helper.cc index dd2dc9a40799e..ae1df10c45f77 100644 --- a/paddle/fluid/platform/collective_helper.cc +++ b/paddle/fluid/platform/collective_helper.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/platform/collective_helper.h" #include +#include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/device/gpu/gpu_resource_pool.h" @@ -187,6 +188,18 @@ NCCLComm* NCCLCommContext::AssignNCCLComm(ncclComm_t comm, int nranks, int rank, int dev_id, int ring_id) { std::unique_ptr dev_ctx( new 
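// ---- Editor's note (illustrative sketch, not part of the patch) ----
// A recurring change in the test hunks above (strided_memcpy_test.cc,
// tensorrt_engine_op_test.cc, bfloat16_test.cu): a stand-alone
// platform::CUDADeviceContext is no longer usable right after construction.
// Tests must bind a device allocator tied to the context's stream and then
// call PartialInitWithAllocator(). Minimal sketch of that sequence:
platform::CUDAPlace gpu0(0);
platform::CUDADeviceContext ctx(gpu0);
ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                     .GetAllocator(gpu0, ctx.stream())
                     .get());
ctx.PartialInitWithAllocator();  // finishes init of the pten::GPUContext base
// ---- end editor's note ----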
CUDADeviceContext(CUDAPlace(dev_id))); + dev_ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(CUDAPlace(dev_id), dev_ctx->stream()) + .get()); + dev_ctx->SetHostAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + dev_ctx->SetZeroAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetZeroAllocator(CUDAPlace(dev_id)) + .get()); + dev_ctx->PartialInitWithAllocator(); std::shared_ptr compute_event( platform::CudaEventResourcePool::Instance().New(dev_id)); @@ -329,7 +342,7 @@ BKCLComm* BKCLCommContext::AssignBKCLComm(BKCLContext_t comm, int nranks, auto* dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get( platform::XPUPlace(dev_id))); - dev_ctx->set_bkcl_context(comm); + dev_ctx->SetBkclContext(comm); } return comm_map_[ring_id][dev_id].get(); diff --git a/paddle/fluid/platform/device/gpu/CMakeLists.txt b/paddle/fluid/platform/device/gpu/CMakeLists.txt index 5cf2258204fda..00f0cc2ac92bf 100644 --- a/paddle/fluid/platform/device/gpu/CMakeLists.txt +++ b/paddle/fluid/platform/device/gpu/CMakeLists.txt @@ -1,12 +1,12 @@ IF(WITH_GPU) add_subdirectory(cuda) - nv_library(gpu_info SRCS gpu_info.cc DEPS cuda_info gflags glog enforce monitor dynload_cuda) + nv_library(gpu_info SRCS gpu_info.cc DEPS pten_gpu_info gflags glog enforce monitor dynload_cuda) nv_test(cuda_helper_test SRCS cuda_helper_test.cu) nv_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda) ELSEIF(WITH_ROCM) add_subdirectory(rocm) - hip_library(gpu_info SRCS gpu_info.cc DEPS rocm_info gflags glog enforce monitor dynload_cuda) + hip_library(gpu_info SRCS gpu_info.cc DEPS pten_gpu_info gflags glog enforce monitor dynload_cuda) hip_test(cuda_helper_test SRCS cuda_helper_test.cu) hip_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda) diff --git a/paddle/fluid/platform/device/gpu/cuda/CMakeLists.txt b/paddle/fluid/platform/device/gpu/cuda/CMakeLists.txt index 5df1de1b00fac..8f7fd3dcbc03a 100644 --- a/paddle/fluid/platform/device/gpu/cuda/CMakeLists.txt +++ b/paddle/fluid/platform/device/gpu/cuda/CMakeLists.txt @@ -1,4 +1,3 @@ -nv_library(cuda_info SRCS cuda_info.cc DEPS gflags glog enforce monitor dynload_cuda) nv_library(cuda_graph SRCS cuda_graph.cc DEPS enforce allocator_facade) nv_library(cuda_profiler SRCS cuda_profiler.cc DEPS enforce) diff --git a/paddle/fluid/platform/device/gpu/cuda/cuda_helper.h b/paddle/fluid/platform/device/gpu/cuda/cuda_helper.h index 3199af9c97520..ab7d474c1ac38 100644 --- a/paddle/fluid/platform/device/gpu/cuda/cuda_helper.h +++ b/paddle/fluid/platform/device/gpu/cuda/cuda_helper.h @@ -14,8 +14,10 @@ #pragma once +#include #include // NOLINT +#include "paddle/fluid/platform/device/gpu/gpu_types.h" #include "paddle/fluid/platform/dynload/cublas.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/macros.h" @@ -96,8 +98,7 @@ class CublasHandleHolder { PADDLE_RETRY_CUDA_SUCCESS(dynload::cublasDestroy(handle_)); } - template - inline void Call(Callback&& callback) const { + inline void Call(const std::function& callback) const { std::lock_guard guard(mtx_); callback(handle_); } diff --git a/paddle/fluid/platform/device/gpu/cuda/cusparse_helper.h b/paddle/fluid/platform/device/gpu/cuda/cusparse_helper.h index 43da9bb1fb42d..cc2b7349d5017 100644 --- a/paddle/fluid/platform/device/gpu/cuda/cusparse_helper.h +++ b/paddle/fluid/platform/device/gpu/cuda/cusparse_helper.h @@ -14,11 +14,13 @@ limitations under the License. 
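// ---- Editor's note (illustrative sketch, not part of the patch) ----
// When a CUDADeviceContext is created outside DeviceContextPool (as in
// AssignNCCLComm above and NCCLContext below), the patch wires all three
// allocators before PartialInitWithAllocator(). Condensed sketch of the hunk:
auto dev_ctx = std::make_unique<platform::CUDADeviceContext>(
    platform::CUDAPlace(dev_id));
auto& facade = paddle::memory::allocation::AllocatorFacade::Instance();
// device allocations, bound to this context's stream
dev_ctx->SetAllocator(
    facade.GetAllocator(platform::CUDAPlace(dev_id), dev_ctx->stream()).get());
// host-side (CPU) allocations
dev_ctx->SetHostAllocator(facade.GetAllocator(platform::CPUPlace()).get());
// zero allocator for this device
dev_ctx->SetZeroAllocator(
    facade.GetZeroAllocator(platform::CUDAPlace(dev_id)).get());
dev_ctx->PartialInitWithAllocator();
// ---- end editor's note ----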
*/ #pragma once +#include #include // NOLINT #include "paddle/fluid/platform/dynload/cusparse.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/macros.h" +#include "paddle/pten/backends/gpu/gpu_decls.h" namespace paddle { namespace platform { @@ -45,8 +47,8 @@ class CusparseHandleHolder { #endif } - template - inline void Call(Callback&& callback) const { + inline void Call( + const std::function& callback) const { std::lock_guard guard(mtx_); callback(handle_); } diff --git a/paddle/fluid/platform/device/gpu/gpu_info.cc b/paddle/fluid/platform/device/gpu/gpu_info.cc index 6e6824ed7896f..59fb26e696e12 100644 --- a/paddle/fluid/platform/device/gpu/gpu_info.cc +++ b/paddle/fluid/platform/device/gpu/gpu_info.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include #include #include #include @@ -39,11 +40,12 @@ limitations under the License. */ #include "paddle/fluid/platform/place.h" #include "paddle/fluid/string/split.h" +#include "paddle/pten/backends/gpu/gpu_info.h" + DECLARE_double(fraction_of_gpu_memory_to_use); DECLARE_uint64(initial_gpu_memory_in_mb); DECLARE_uint64(reallocate_gpu_memory_in_mb); DECLARE_bool(enable_cublas_tensor_op_math); -DECLARE_string(selected_gpus); DECLARE_uint64(gpu_memory_limit_mb); constexpr static float fraction_reserve_gpu_memory = 0.05f; @@ -51,23 +53,6 @@ constexpr static float fraction_reserve_gpu_memory = 0.05f; USE_GPU_MEM_STAT; namespace paddle { namespace platform { -//! Get a list of device ids from environment variable or use all. -std::vector GetSelectedDevices() { - // use user specified GPUs in single-node multi-process mode. - std::vector devices; - if (!FLAGS_selected_gpus.empty()) { - auto devices_str = paddle::string::Split(FLAGS_selected_gpus, ','); - for (auto id : devices_str) { - devices.push_back(atoi(id.c_str())); - } - } else { - int count = GetGPUDeviceCount(); - for (int i = 0; i < count; ++i) { - devices.push_back(i); - } - } - return devices; -} void GpuMemoryUsage(size_t *available, size_t *total) { size_t actual_available, actual_total; @@ -382,5 +367,91 @@ void *GetGpuBasePtr(void *ptr, int dev_id) { return RecordedGpuMallocHelper::Instance(dev_id)->GetBasePtr(ptr); } +int DnnVersion() { return pten::backends::gpu::DnnVersion(); } + +int GetGPUDeviceCount() { return pten::backends::gpu::GetGPUDeviceCount(); } + +int GetGPUComputeCapability(int id) { + return pten::backends::gpu::GetGPUComputeCapability(id); +} + +int GetGPURuntimeVersion(int id) { + return pten::backends::gpu::GetGPURuntimeVersion(id); +} + +int GetGPUDriverVersion(int id) { + return pten::backends::gpu::GetGPUDriverVersion(id); +} + +bool TensorCoreAvailable() { + return pten::backends::gpu::TensorCoreAvailable(); +} + +int GetGPUMultiProcessors(int id) { + return pten::backends::gpu::GetGPUMultiProcessors(id); +} + +int GetGPUMaxThreadsPerMultiProcessor(int id) { + return pten::backends::gpu::GetGPUMaxThreadsPerMultiProcessor(id); +} + +int GetGPUMaxThreadsPerBlock(int id) { + return pten::backends::gpu::GetGPUMaxThreadsPerBlock(id); +} + +int GetCurrentDeviceId() { return pten::backends::gpu::GetCurrentDeviceId(); } + +std::array GetGpuMaxGridDimSize(int id) { + return pten::backends::gpu::GetGpuMaxGridDimSize(id); +} + +std::vector GetSelectedDevices() { + return pten::backends::gpu::GetSelectedDevices(); +} + +const gpuDeviceProp &GetDeviceProperties(int id) { + return 
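// ---- Editor's note (self-contained sketch, not part of the patch) ----
// The fluid-level body of GetSelectedDevices() (parse FLAGS_selected_gpus,
// otherwise enumerate every visible device) is deleted above; the function now
// forwards to pten::backends::gpu::GetSelectedDevices(). The helper below
// restates the removed logic in a self-contained form; it is an assumption
// that the pten-side implementation behaves the same way.
#include <sstream>
#include <string>
#include <vector>

std::vector<int> SelectDevices(const std::string& selected_gpus,
                               int device_count) {
  std::vector<int> devices;
  if (!selected_gpus.empty()) {
    // e.g. selected_gpus == "0,2,3"  ->  {0, 2, 3}
    std::stringstream ss(selected_gpus);
    for (std::string id; std::getline(ss, id, ',');) {
      devices.push_back(std::stoi(id));
    }
  } else {
    for (int i = 0; i < device_count; ++i) devices.push_back(i);
  }
  return devices;
}
// ---- end editor's note ----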
pten::backends::gpu::GetDeviceProperties(id); +} + +void SetDeviceId(int device_id) { pten::backends::gpu::SetDeviceId(device_id); } + +gpuError_t GpuGetLastError() { return pten::backends::gpu::GpuGetLastError(); } + +void GpuStreamSync(gpuStream_t stream) { + pten::backends::gpu::GpuStreamSync(stream); +} + +void GpuDestroyStream(gpuStream_t stream) { + pten::backends::gpu::GpuDestroyStream(stream); +} + +void GpuDeviceSync() { pten::backends::gpu::GpuDeviceSync(); } + +void GpuMemcpyAsync(void *dst, const void *src, size_t count, + gpuMemcpyKind kind, gpuStream_t stream) { + pten::backends::gpu::GpuMemcpyAsync(dst, src, count, kind, stream); +} + +void GpuMemcpySync(void *dst, const void *src, size_t count, + gpuMemcpyKind kind) { + pten::backends::gpu::GpuMemcpySync(dst, src, count, kind); +} + +void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src, + int src_device, size_t count, gpuStream_t stream) { + pten::backends::gpu::GpuMemcpyPeerAsync(dst, dst_device, src, src_device, + count, stream); +} + +void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src, + int src_device, size_t count) { + pten::backends::gpu::GpuMemcpyPeerSync(dst, dst_device, src, src_device, + count); +} + +void GpuMemsetAsync(void *dst, int value, size_t count, gpuStream_t stream) { + pten::backends::gpu::GpuMemsetAsync(dst, value, count, stream); +} + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/device/gpu/gpu_info.h b/paddle/fluid/platform/device/gpu/gpu_info.h index 1e072625ada1a..f6fb2ad8ce72e 100644 --- a/paddle/fluid/platform/device/gpu/gpu_info.h +++ b/paddle/fluid/platform/device/gpu/gpu_info.h @@ -14,6 +14,7 @@ limitations under the License. */ #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include +#include #include #include @@ -52,7 +53,7 @@ int GetGPUMaxThreadsPerBlock(int id); int GetCurrentDeviceId(); //! Get the maximum GridDim size for GPU buddy allocator. -dim3 GetGpuMaxGridDimSize(int); +std::array GetGpuMaxGridDimSize(int); //! Get a list of device ids from environment variable or use all. std::vector GetSelectedDevices(); @@ -110,7 +111,7 @@ void GpuStreamSync(gpuStream_t stream); void GpuDestroyStream(gpuStream_t stream); // ! Blocks until device has completed all operations. -void GpuDeviceync(); +void GpuDeviceSync(); //! 
CudaMalloc with recorded info gpuError_t RecordedGpuMalloc(void **ptr, size_t size, int dev_id); diff --git a/paddle/fluid/platform/device/gpu/nccl_helper.h b/paddle/fluid/platform/device/gpu/nccl_helper.h index 261916b2555be..1d6ccdc1280a9 100644 --- a/paddle/fluid/platform/device/gpu/nccl_helper.h +++ b/paddle/fluid/platform/device/gpu/nccl_helper.h @@ -83,8 +83,21 @@ struct NCCLContext { std::unique_ptr ctx_; ncclComm_t comm_; - explicit NCCLContext(int dev_id) - : ctx_(new CUDADeviceContext(CUDAPlace(dev_id))), comm_{nullptr} {} + explicit NCCLContext(int dev_id) : comm_{nullptr} { + ctx_.reset(new CUDADeviceContext(CUDAPlace(dev_id))); + ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(CUDAPlace(dev_id), ctx_->stream()) + .get()); + ctx_->SetHostAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + ctx_->SetZeroAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetZeroAllocator(CUDAPlace(dev_id)) + .get()); + ctx_->PartialInitWithAllocator(); + } gpuStream_t stream() const { return ctx_->stream(); } ncclComm_t comm() const { return comm_; } diff --git a/paddle/fluid/platform/device/gpu/rocm/CMakeLists.txt b/paddle/fluid/platform/device/gpu/rocm/CMakeLists.txt index 86b9ecd5f5445..988807258c123 100644 --- a/paddle/fluid/platform/device/gpu/rocm/CMakeLists.txt +++ b/paddle/fluid/platform/device/gpu/rocm/CMakeLists.txt @@ -1,3 +1 @@ -hip_library(rocm_info SRCS rocm_info.cc DEPS gflags glog enforce monitor dynload_cuda) - hip_test(miopen_helper_test SRCS miopen_helper_test.cc DEPS dynload_cuda) diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index fdd9883c2c924..1e674258334b0 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -10,8 +10,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/device_context.h" +#include #include #include +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/platform/stream/cuda_stream.h" +#include "paddle/pten/backends/gpu/gpu_context.h" #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/memory/allocation/cuda_device_context_allocator.h" @@ -149,16 +153,17 @@ inline void EmplaceDeviceContext( cuda_ctx, platform::errors::InvalidArgument( "Failed to dynamic_cast dev_ctx into CUDADeviceContext.")); - dev_ctx->SetDeviceAllocator( - memory::allocation::AllocatorFacade::Instance() - .GetAllocator(p, cuda_ctx->context()->RawStream()) - .get()); + // Note: A trick method to init context, why GetAllocator interface + // needs a stream parameter? 
+ dev_ctx->SetAllocator(memory::allocation::AllocatorFacade::Instance() + .GetAllocator(p, cuda_ctx->stream()) + .get()); + cuda_ctx->PartialInitWithAllocator(); #endif } else { - dev_ctx->SetDeviceAllocator( - memory::allocation::AllocatorFacade::Instance() - .GetAllocator(p) - .get()); + dev_ctx->SetAllocator(memory::allocation::AllocatorFacade::Instance() + .GetAllocator(p) + .get()); } dev_ctx->SetHostAllocator( memory::allocation::AllocatorFacade::Instance() @@ -251,14 +256,18 @@ DeviceContextPool::DeviceContextPool( } } -CPUDeviceContext::CPUDeviceContext() : pten::CPUContext() {} +CPUDeviceContext::CPUDeviceContext() : pten::CPUContext() { + pten::CPUContext::Init(); +} -CPUDeviceContext::CPUDeviceContext(CPUPlace place) : pten::CPUContext() {} +CPUDeviceContext::CPUDeviceContext(CPUPlace place) : pten::CPUContext(place) { + pten::CPUContext::Init(); +} #ifdef PADDLE_WITH_IPU IPUDeviceContext::IPUDeviceContext(IPUPlace place) : place_(place) {} -Place IPUDeviceContext::GetPlace() const { return place_; } +const Place& IPUDeviceContext::GetPlace() const { return place_; } void IPUDeviceContext::Wait() const { /*! \brief Wait for all operations completion in the stream. */ @@ -268,11 +277,14 @@ IPUDeviceContext::~IPUDeviceContext() {} #endif #ifdef PADDLE_WITH_XPU -XPUDeviceContext::XPUDeviceContext() : pten::XPUContext() {} +XPUDeviceContext::XPUDeviceContext() : pten::XPUContext() { + pten::XPUContext::Init(); +} XPUDeviceContext::~XPUDeviceContext() {} XPUDeviceContext::XPUDeviceContext(XPUPlace place) : pten::XPUContext(place) { + pten::XPUContext::Init(); LOG_FIRST_N(WARNING, 1) << "Please NOTE: xpu device: " << static_cast(place.device); } @@ -302,7 +314,7 @@ void NPUDeviceContext::Wait() const { aclrtStream NPUDeviceContext::stream() const { return stream_->raw_stream(); } -Place NPUDeviceContext::GetPlace() const { return place_; } +const Place& NPUDeviceContext::GetPlace() const { return place_; } aclrtContext NPUDeviceContext::context() const { return context_; } @@ -319,7 +331,7 @@ Eigen::DefaultDevice* NPUPinnedDeviceContext::eigen_device() const { return eigen_device_.get(); } -Place NPUPinnedDeviceContext::GetPlace() const { return place_; } +const Place& NPUPinnedDeviceContext::GetPlace() const { return place_; } #endif @@ -470,102 +482,28 @@ CUDAContext::~CUDAContext() { #endif } -CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) { - CUDADeviceGuard guard(place_.device); - compute_capability_ = GetGPUComputeCapability(place_.device); - multi_process_ = GetGPUMultiProcessors(place_.device); - max_threads_per_mp_ = GetGPUMaxThreadsPerMultiProcessor(place_.device); - max_grid_dim_size_ = GetGpuMaxGridDimSize(place_.device); - max_threads_per_block_ = GetGPUMaxThreadsPerBlock(place_.device); - - driver_version_ = GetGPUDriverVersion(place_.device); - runtime_version_ = GetGPURuntimeVersion(place_.device); - - LOG_FIRST_N(WARNING, 1) << "Please NOTE: device: " - << static_cast(place_.device) - << ", GPU Compute Capability: " - << compute_capability_ / 10 << "." - << compute_capability_ % 10 - << ", Driver API Version: " << driver_version_ / 1000 - << "." << (driver_version_ % 100) / 10 - << ", Runtime API Version: " - << runtime_version_ / 1000 << "." 
- << (runtime_version_ % 100) / 10; -#ifdef PADDLE_WITH_HIP - size_t version_major, version_minor, version_patch; - PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenGetVersion( - &version_major, &version_minor, &version_patch)); - LOG_FIRST_N(WARNING, 1) << "device: " << static_cast(place_.device) - << ", MIOpen Version: " << version_major << "." - << version_minor << "." << version_patch; -#else - size_t cudnn_dso_ver = dynload::cudnnGetVersion(); - LOG_FIRST_N(WARNING, 1) << "device: " << static_cast(place_.device) - << ", cuDNN Version: " << cudnn_dso_ver / 1000 << "." - << (cudnn_dso_ver % 1000) / 100 << "."; -#endif - { - // Check CUDA/CUDNN version compatiblity - auto local_cuda_version = - (driver_version_ / 1000) * 10 + (driver_version_ % 100) / 10; -#ifdef PADDLE_WITH_HIP - auto compile_cuda_version = (HIP_VERSION / 100) * 10 + (HIP_VERSION % 10); -#else - auto compile_cuda_version = - (CUDA_VERSION / 1000) * 10 + (CUDA_VERSION % 100) / 10; -#endif - if (local_cuda_version < compile_cuda_version) { - LOG_FIRST_N(WARNING, 1) - << "WARNING: device: " << static_cast(place_.device) - << ". The installed Paddle is compiled with CUDA " - << compile_cuda_version / 10 << "." << compile_cuda_version % 10 - << ", but CUDA runtime version in your machine is " - << local_cuda_version / 10 << "." << local_cuda_version % 10 - << ", which may cause serious incompatible bug. " - << "Please recompile or reinstall Paddle with compatible CUDA " - "version."; - } - } - default_ctx_.reset(new CUDAContext(place_)); -} - -CUDADeviceContext::~CUDADeviceContext() { - SetDeviceId(place_.device); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - if (nccl_comm_) { - PADDLE_ENFORCE_GPU_SUCCESS(dynload::ncclCommDestroy(nccl_comm_)); - } -#endif -} - -Place CUDADeviceContext::GetPlace() const { return place_; } - -void CUDADeviceContext::Wait() const { context()->Stream()->Wait(); } - -int CUDADeviceContext::GetComputeCapability() const { - return compute_capability_; -} - -int CUDADeviceContext::GetMaxPhysicalThreadCount() const { - return multi_process_ * max_threads_per_mp_; +CUDADeviceContext::CUDADeviceContext(CUDAPlace place) + : pten::GPUContext(place) { + pten::GPUContext::PartialInitWithoutAllocator(); + cuda_stream_.reset( + new stream::CUDAStream(pten::GPUContext::stream(), this->GetPlace())); } -int CUDADeviceContext::GetSMCount() const { return multi_process_; } - -int CUDADeviceContext::GetMaxThreadsPerBlock() const { - return max_threads_per_block_; -} +CUDADeviceContext::~CUDADeviceContext() = default; Eigen::GpuDevice* CUDADeviceContext::eigen_device() const { - return context()->EigenDevice().get(); -} - -bool CUDADeviceContext::tensor_core_available() const { - return context()->CublasTensorCoreHandle() != nullptr; + if (thread_ctx_.count(this)) { + return context()->EigenDevice().get(); + } + return pten::GPUContext::eigen_device(); } -dim3 CUDADeviceContext::GetCUDAMaxGridDimSize() const { - return max_grid_dim_size_; +void CUDADeviceContext::Wait() const { + if (thread_ctx_.count(this)) { + context()->Stream()->Wait(); + return; + } + pten::GPUContext::Wait(); } #ifdef PADDLE_WITH_HIP @@ -573,33 +511,96 @@ miopenHandle_t CUDADeviceContext::cudnn_handle() const { #else cudnnHandle_t CUDADeviceContext::cudnn_handle() const { #endif - return context()->CudnnHandle(); + if (thread_ctx_.count(this)) { + return context()->CudnnHandle(); + } + return pten::GPUContext::cudnn_handle(); } #ifdef PADDLE_WITH_HIP rocblas_handle CUDADeviceContext::cublas_handle() const { - return 
context()->CublasHandle()->GetCublasHandle(); + if (thread_ctx_.count(this)) { + return context()->CublasHandle()->GetCublasHandle(); + } + return pten::GPUContext::cublas_handle(); } #else cublasHandle_t CUDADeviceContext::cublas_handle() const { - return context()->CublasHandle()->GetCublasHandle(); + if (thread_ctx_.count(this)) { + return context()->CublasHandle()->GetCublasHandle(); + } + return pten::GPUContext::cublas_handle(); } cusparseHandle_t CUDADeviceContext::cusparse_handle() const { - return context()->CusparseHandle()->GetCusparseHandle(); + if (thread_ctx_.count(this)) { + return context()->CusparseHandle()->GetCusparseHandle(); + } + return pten::GPUContext::cusparse_handle(); +} +cusolverDnHandle_t CUDADeviceContext::cusolver_dn_handle() const { + if (thread_ctx_.count(this)) { + return context()->CusolverDnHandle(); + } + return pten::GPUContext::cusolver_dn_handle(); } #endif +void CUDADeviceContext::RecordEvent( + gpuEvent_t ev, const std::function& callback) const { + if (thread_ctx_.count(this)) { + context()->Stream()->RecordEvent(ev, callback); + return; + } + pten::GPUContext::RecordEvent(ev, callback); +} + +void CUDADeviceContext::AddStreamCallback( + const std::function& callback) const { + if (thread_ctx_.count(this)) { + context()->Stream()->AddCallback(callback); + return; + } + pten::GPUContext::AddStreamCallback(callback); +} + +void CUDADeviceContext::WaitStreamCallback() const { + if (thread_ctx_.count(this)) { + context()->Stream()->WaitCallback(); + return; + } + pten::GPUContext::WaitStreamCallback(); +} + CudnnWorkspaceHandle CUDADeviceContext::cudnn_workspace_handle() const { return CudnnWorkspaceHandle(*this, &cudnn_handle_mtx_); } -#ifndef PADDLE_WITH_HIP -cusolverDnHandle_t CUDADeviceContext::cusolver_dn_handle() const { - return context()->CusolverDnHandle(); +gpuStream_t CUDADeviceContext::stream() const { + if (thread_ctx_.count(this)) { + return context()->RawStream(); + } + return pten::GPUContext::stream(); } -#endif -gpuStream_t CUDADeviceContext::stream() const { return context()->RawStream(); } +std::shared_ptr CUDADeviceContext::context() const { + if (!thread_ctx_.count(this)) { + PADDLE_THROW(platform::errors::PermissionDenied( + "CUDADeviceContext call context() failed, make sure in the " + "thread_local semantic.")); + } + return thread_ctx_.at(this); +} + +stream::CUDAStream* CUDADeviceContext::GetCudaStream() const { + return cuda_stream_.get(); +} + +stream::CUDAStream* CUDADeviceContext::SetCudaStream( + stream::CUDAStream* new_stream_ptr) { + auto* old_stream_ptr = cuda_stream_.release(); + cuda_stream_.reset(new_stream_ptr); + return old_stream_ptr; +} CUDAPinnedDeviceContext::CUDAPinnedDeviceContext() { eigen_device_.reset(new Eigen::DefaultDevice()); @@ -614,7 +615,7 @@ Eigen::DefaultDevice* CUDAPinnedDeviceContext::eigen_device() const { return eigen_device_.get(); } -Place CUDAPinnedDeviceContext::GetPlace() const { return place_; } +const Place& CUDAPinnedDeviceContext::GetPlace() const { return place_; } #endif #ifdef PADDLE_WITH_MKLDNN diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 17b22907b1532..4d469e92c04cd 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -10,6 +10,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include #include // NOLINT #include #include // NOLINT @@ -18,7 +19,9 @@ limitations under the License. 
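// ---- Editor's note (self-contained mock, not Paddle code) ----
// Every accessor rewritten above follows the same dispatch rule: if a
// thread-local CUDAContext has been registered for this object (thread_ctx_),
// use its handle/stream; otherwise fall back to the resource owned by the
// pten::GPUContext base class. A minimal mock of that pattern:
#include <memory>
#include <unordered_map>

struct BaseContext {                    // stands in for pten::GPUContext
  int handle() const { return 1; }
};
struct ThreadLocalContext {             // stands in for CUDAContext
  int handle() const { return 2; }
};
struct DerivedContext : BaseContext {   // stands in for CUDADeviceContext
  int handle() const {
    if (thread_ctx_.count(this)) {
      return thread_ctx_.at(this)->handle();  // thread-local override wins
    }
    return BaseContext::handle();             // default: base-class resource
  }
  static thread_local std::unordered_map<const DerivedContext*,
                                         std::shared_ptr<ThreadLocalContext>>
      thread_ctx_;
};
thread_local std::unordered_map<const DerivedContext*,
                                std::shared_ptr<ThreadLocalContext>>
    DerivedContext::thread_ctx_;
// ---- end editor's note ----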
*/ #include #include +#include "paddle/fluid/platform/device/gpu/gpu_types.h" #include "paddle/pten/backends/cpu/cpu_context.h" +#include "paddle/pten/backends/gpu/gpu_decls.h" #include "paddle/pten/core/device_context.h" #include "paddle/fluid/memory/malloc.h" @@ -28,6 +31,7 @@ limitations under the License. */ #include "paddle/fluid/platform/dynload/cudnn.h" #include "paddle/fluid/platform/dynload/cusolver.h" #include "paddle/fluid/platform/dynload/cusparse.h" +#include "paddle/pten/backends/gpu/gpu_context.h" #if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) #include "paddle/fluid/platform/dynload/nccl.h" #endif @@ -38,6 +42,7 @@ limitations under the License. */ #include "paddle/fluid/platform/device/gpu/gpu_helper.h" // NOLINT #include "paddle/fluid/platform/dynload/miopen.h" #include "paddle/fluid/platform/dynload/rocblas.h" +#include "paddle/pten/backends/gpu/gpu_context.h" // NOLINT #if !defined(__APPLE__) && defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/dynload/rccl.h" #endif @@ -145,7 +150,7 @@ class IPUDeviceContext : public DeviceContext { explicit IPUDeviceContext(IPUPlace place); virtual ~IPUDeviceContext(); Eigen::DefaultDevice* eigen_device() const { return nullptr; } - Place GetPlace() const override; + const Place& GetPlace() const override; /*! \brief Wait for all operations completion in the stream. */ void Wait() const override; @@ -187,7 +192,7 @@ class NPUDeviceContext : public DeviceContext { explicit NPUDeviceContext(NPUPlace place); virtual ~NPUDeviceContext(); Eigen::DefaultDevice* eigen_device() const { return nullptr; } - Place GetPlace() const override; + const Place& GetPlace() const override; aclrtContext context() const; /*! \brief Wait for all operations completion in the stream. */ @@ -247,7 +252,7 @@ class NPUPinnedDeviceContext : public DeviceContext { NPUPinnedDeviceContext(); explicit NPUPinnedDeviceContext(NPUPinnedPlace place); - Place GetPlace() const override; + const Place& GetPlace() const override; Eigen::DefaultDevice* eigen_device() const; @@ -326,20 +331,20 @@ class CUDAContext { #endif /*! \brief Call cublas function safely. */ - template - inline void CublasCall(Callback&& callback) const { + inline void CublasCall( + const std::function& callback) const { if (cublas_tf32_tensor_core_handle_) { - cublas_tf32_tensor_core_handle_->Call(std::forward(callback)); + cublas_tf32_tensor_core_handle_->Call(callback); } else { - cublas_handle_->Call(std::forward(callback)); + cublas_handle_->Call(callback); } } #ifndef PADDLE_WITH_HIP /*! \brief Call cusparse function safely. */ - template - inline void CusparseCall(Callback&& callback) const { - cusparse_handle_->Call(std::forward(callback)); + inline void CusparseCall( + const std::function& callback) const { + cusparse_handle_->Call(callback); } #endif @@ -348,12 +353,12 @@ class CUDAContext { /*! \brief Call cublas function with Tensor Core safely. If Tensor Core is not available, use DEFAULT_MATH instead. 
*/ - template - inline void TensorCoreCublasCallIfAvailable(Callback&& callback) const { + inline void TensorCoreCublasCallIfAvailable( + const std::function& callback) const { if (cublas_tensor_core_handle_) { - cublas_tensor_core_handle_->Call(std::forward(callback)); + cublas_tensor_core_handle_->Call(callback); } else { - cublas_handle_->Call(std::forward(callback)); + cublas_handle_->Call(callback); } } @@ -491,7 +496,7 @@ class CUDAContext { DISABLE_COPY_AND_ASSIGN(CUDAContext); }; -class CUDADeviceContext : public DeviceContext { +class CUDADeviceContext : public pten::GPUContext { public: explicit CUDADeviceContext(CUDAPlace place); virtual ~CUDADeviceContext(); @@ -499,49 +504,40 @@ class CUDADeviceContext : public DeviceContext { /*! \brief Wait for all operations completion in the stream. */ void Wait() const override; - /*! \brief Return place in the device context. */ - Place GetPlace() const override; - - /*! \brief Return compute capability in the device context. */ - int GetComputeCapability() const; - - /*! \brief Return the max physical thread count in the device context */ - int GetMaxPhysicalThreadCount() const; - - /*! \brief Return the SM count in the device context */ - int GetSMCount() const; - - /*! \brief Return the Max thread num of block in the device context */ - int GetMaxThreadsPerBlock() const; - - /*! \brief Return the max grid dim size in the device context */ - dim3 GetCUDAMaxGridDimSize() const; - /*! \brief Return eigen device in the device context. */ Eigen::GpuDevice* eigen_device() const; /*! \brief Call cublas function safely. */ - template - inline void CublasCall(Callback&& callback) const { + inline void CublasCall( + const std::function& callback) const { + if (!thread_ctx_.count(this)) { + pten::GPUContext::CublasCall(callback); + return; + } return context()->CublasCall(callback); } #ifndef PADDLE_WITH_HIP /*! \brief Call cusparse function safely. */ - template - inline void CusparseCall(Callback&& callback) const { - return context()->CusparseCall(callback); + inline void CusparseCall( + const std::function& callback) const { + if (!thread_ctx_.count(this)) { + pten::GPUContext::CusparseCall(callback); + return; + } + context()->CusparseCall(callback); } #endif - /*! \brief Check whether tensor core is supported */ - bool tensor_core_available() const; - /*! \brief Call cublas function with Tensor Core safely. If Tensor Core is not available, use DEFAULT_MATH instead. */ - template - inline void TensorCoreCublasCallIfAvailable(Callback&& callback) const { - return context()->TensorCoreCublasCallIfAvailable(callback); + inline void TensorCoreCublasCallIfAvailable( + const std::function& callback) const { + if (!thread_ctx_.count(this)) { + pten::GPUContext::TensorCoreCublasCallIfAvailable(callback); + return; + } + context()->TensorCoreCublasCallIfAvailable(callback); } /*! \brief Return cudnn handle in the device context. */ @@ -559,6 +555,10 @@ class CUDADeviceContext : public DeviceContext { cusparseHandle_t cusparse_handle() const; #endif +#ifndef PADDLE_WITH_HIP + cusolverDnHandle_t cusolver_dn_handle() const; +#endif + /*! \brief Return a cudnn workspace handle to call multiple cudnn * functions without interrupting by other threads. * Once the first cudnn function is called by the handle, a lock @@ -568,60 +568,33 @@ class CUDADeviceContext : public DeviceContext { * sequential cudnn function calls. 
*/ CudnnWorkspaceHandle cudnn_workspace_handle() const; -#ifndef PADDLE_WITH_HIP - cusolverDnHandle_t cusolver_dn_handle() const; -#endif - /*! \brief Return cuda stream in the device context. */ gpuStream_t stream() const; -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - /*! \brief Return nccl communicators. */ - ncclComm_t nccl_comm() const { return nccl_comm_; } - - /*! \brief Set nccl communicators. */ - void set_nccl_comm(ncclComm_t comm) { nccl_comm_ = comm; } -#endif - - template - void RecordEvent(gpuEvent_t ev, Callback callback) const { - return context()->Stream()->RecordEvent(ev, callback); - } - - template - void AddStreamCallback(Callback&& callback) const { - return context()->Stream()->AddCallback(callback); - } + void RecordEvent(gpuEvent_t ev, const std::function& callback) const; - void WaitStreamCallback() const { - return context()->Stream()->WaitCallback(); - } + void AddStreamCallback(const std::function& callback) const; - void ResetDefaultContext(const stream::Priority& priority) { - default_ctx_.reset(new CUDAContext(place_, priority)); - } + void WaitStreamCallback() const; void ResetThreadContext(const stream::Priority& priority) { std::lock_guard guard(ctx_mtx_); - thread_ctx_[this].reset(new CUDAContext(place_, priority)); + thread_ctx_[this].reset(new CUDAContext(this->GetPlace(), priority)); } - std::shared_ptr context() const { - if (!thread_ctx_.count(this)) { - return default_ctx_; - } - return thread_ctx_.at(this); - } + std::shared_ptr context() const; // Note: Can only be used under thread_local semantics. void SetThreadLocalStream(const gpuStream_t stream) { thread_ctx_.at(this)->SetStream(stream); } - private: - CUDAPlace place_; - std::shared_ptr default_ctx_; + // NOTE: Just for compatibility with the past, please delete if there is an + // elegant way. + stream::CUDAStream* GetCudaStream() const; + stream::CUDAStream* SetCudaStream(stream::CUDAStream*); + private: // The thread_local static variable will be released before the // global static variable, so avoid using it in dtor. static thread_local std::unordered_map cuda_stream_; DISABLE_COPY_AND_ASSIGN(CUDADeviceContext); }; @@ -711,7 +671,7 @@ class CUDAPinnedDeviceContext : public DeviceContext { CUDAPinnedDeviceContext(); explicit CUDAPinnedDeviceContext(CUDAPinnedPlace place); - Place GetPlace() const override; + const Place& GetPlace() const override; Eigen::DefaultDevice* eigen_device() const; diff --git a/paddle/fluid/platform/device_context_test.cu b/paddle/fluid/platform/device_context_test.cu index cf617a478eb71..851c756b665b8 100644 --- a/paddle/fluid/platform/device_context_test.cu +++ b/paddle/fluid/platform/device_context_test.cu @@ -17,6 +17,7 @@ limitations under the License. 
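// ---- Editor's note (illustrative sketch, not part of the patch) ----
// With the template callbacks replaced by std::function above, call sites keep
// the same lambda form. The callback parameter type is assumed to be
// cublasHandle_t (rocblas_handle under HIP); the exact typedef is stripped in
// this rendering of the diff.
dev_ctx.CublasCall([&](cublasHandle_t handle) {
  // issue cuBLAS calls on `handle`; the holder serializes access with a mutex
});
// TensorCoreCublasCallIfAvailable(...) takes the same callback and silently
// falls back to the default-math handle when Tensor Cores are unavailable.
// ---- end editor's note ----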
*/ #include "glog/logging.h" #include "gtest/gtest.h" +#include "paddle/fluid/memory/allocation/allocator_facade.h" TEST(Device, Init) { using paddle::platform::DeviceContext; @@ -26,6 +27,20 @@ TEST(Device, Init) { int count = paddle::platform::GetGPUDeviceCount(); for (int i = 0; i < count; i++) { CUDADeviceContext* device_context = new CUDADeviceContext(CUDAPlace(i)); + device_context->SetAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(CUDAPlace(i), device_context->stream()) + .get()); + device_context->SetHostAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + device_context->SetZeroAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetZeroAllocator(CUDAPlace(i)) + .get()); + device_context->PartialInitWithAllocator(); + Eigen::GpuDevice* gpu_device = device_context->eigen_device(); ASSERT_NE(nullptr, gpu_device); delete device_context; @@ -39,6 +54,19 @@ TEST(Device, CUDADeviceContext) { int count = paddle::platform::GetGPUDeviceCount(); for (int i = 0; i < count; i++) { CUDADeviceContext* device_context = new CUDADeviceContext(CUDAPlace(i)); + device_context->SetAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(CUDAPlace(i), device_context->stream()) + .get()); + device_context->SetHostAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + device_context->SetZeroAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetZeroAllocator(CUDAPlace(i)) + .get()); + device_context->PartialInitWithAllocator(); Eigen::GpuDevice* gpu_device = device_context->eigen_device(); ASSERT_NE(nullptr, gpu_device); #ifdef PADDLE_WITH_HIP diff --git a/paddle/fluid/platform/device_event_gpu.cc b/paddle/fluid/platform/device_event_gpu.cc index 0a6b3917fbc21..a811a5b9c130d 100644 --- a/paddle/fluid/platform/device_event_gpu.cc +++ b/paddle/fluid/platform/device_event_gpu.cc @@ -53,7 +53,7 @@ void DeviceEventRecordCUDA(DeviceEvent* event, const DeviceContext* context) { platform::errors::PreconditionNotMet( "Failed to dynamic_cast context into CUDADeviceContext.")); - wrapper->inner_event_.Record(*cuda_dev_ctx->context()->Stream()); + wrapper->inner_event_.Record(cuda_dev_ctx->stream()); } bool DeviceEventQueryCUDA(const DeviceEvent* event) { @@ -82,8 +82,7 @@ void DeviceEventCUDAWaitCUDA(const DeviceEvent* event, platform::errors::PreconditionNotMet( "Failed to dynamic_cast context into CUDADeviceContext.")); // calling cudaStreamWaitEvent(stream, event, 0) - cuda_dev_ctx->context()->Stream()->WaitEvent( - wrapper->inner_event_.GetRawCudaEvent()); + cuda_dev_ctx->WaitEvent(wrapper->inner_event_.GetRawCudaEvent()); } void DeviceEventCPUWaitCUDA(const DeviceEvent* event, diff --git a/paddle/fluid/platform/device_event_test.cc b/paddle/fluid/platform/device_event_test.cc index a56d94b892e98..96e89f9257dd2 100644 --- a/paddle/fluid/platform/device_event_test.cc +++ b/paddle/fluid/platform/device_event_test.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/platform/device_event.h" #include "glog/logging.h" #include "gtest/gtest.h" +#include "paddle/fluid/platform/place.h" using ::paddle::platform::kCUDA; using ::paddle::platform::kCPU; @@ -38,9 +39,11 @@ TEST(DeviceEvent, CUDA) { // case 1. test for event_creator DeviceEvent event(place); ASSERT_NE(event.GetEvent().get(), nullptr); + bool status = event.Query(); + ASSERT_EQ(status, true); // case 2. 
test for event_recorder event.Record(context); - bool status = event.Query(); + status = event.Query(); ASSERT_EQ(status, false); // case 3. test for event_finisher event.Finish(); diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index c751ee1e69b2b..7de5055431972 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -539,7 +539,7 @@ inline void retry_sleep(unsigned milliseconds) { ::paddle::platform::details::ExternalApiType< \ __CUDA_STATUS_TYPE__>::kSuccess; \ while (UNLIKELY(__cond__ != __success_type__) && retry_count < 5) { \ - retry_sleep(FLAGS_gpu_allocator_retry_time); \ + paddle::platform::retry_sleep(FLAGS_gpu_allocator_retry_time); \ __cond__ = (COND); \ ++retry_count; \ } \ @@ -727,7 +727,7 @@ inline void retry_sleep(unsigned millisecond) { ::paddle::platform::details::ExternalApiType< \ __CUDA_STATUS_TYPE__>::kSuccess; \ while (UNLIKELY(__cond__ != __success_type__) && retry_count < 5) { \ - retry_sleep(FLAGS_gpu_allocator_retry_time); \ + ::paddle::platform::retry_sleep(FLAGS_gpu_allocator_retry_time); \ __cond__ = (COND); \ ++retry_count; \ } \ diff --git a/paddle/fluid/platform/event.h b/paddle/fluid/platform/event.h index da5080cc86f0c..86df34acd7612 100644 --- a/paddle/fluid/platform/event.h +++ b/paddle/fluid/platform/event.h @@ -152,11 +152,11 @@ class CudaEvent { #endif } - void Record(const paddle::platform::stream::CUDAStream &stream) { + void Record(gpuStream_t stream) { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event_, stream.raw_stream())); + PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event_, stream)); #else - PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event_, stream.raw_stream())); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event_, stream)); #endif } diff --git a/paddle/fluid/platform/float16_test.cu b/paddle/fluid/platform/float16_test.cu index 7a047d790abdc..cc5c11778aa30 100644 --- a/paddle/fluid/platform/float16_test.cu +++ b/paddle/fluid/platform/float16_test.cu @@ -328,6 +328,10 @@ TEST(float16, lod_tensor_on_gpu) { // CPU LoDTensor to GPU LoDTensor CUDAPlace gpu_place(0); CUDADeviceContext gpu_ctx(gpu_place); + gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(gpu_place, gpu_ctx.stream()) + .get()); + gpu_ctx.PartialInitWithAllocator(); framework::TensorCopy(src_tensor, gpu_place, gpu_ctx, &gpu_tensor); // GPU LoDTensor to CPU LoDTensor diff --git a/paddle/fluid/platform/for_range.h b/paddle/fluid/platform/for_range.h index 1d4be3801dda9..aef36a893156d 100644 --- a/paddle/fluid/platform/for_range.h +++ b/paddle/fluid/platform/for_range.h @@ -16,6 +16,7 @@ limitations under the License. */ #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/pten/backends/cpu/cpu_context.h" +#include "paddle/pten/backends/gpu/gpu_context.h" namespace paddle { namespace platform { @@ -72,6 +73,7 @@ __global__ static void ForRangeElemwiseOp(Function func, size_t limit) { } } +// NOTE: After the pten kernel is migrated, it needs to be deleted. 
template <> struct ForRange { ForRange(const CUDADeviceContext& dev_ctx, size_t limit) @@ -106,6 +108,40 @@ struct ForRange { size_t limit_; }; +template <> +struct ForRange { + ForRange(const pten::GPUContext& dev_ctx, size_t limit) + : dev_ctx_(dev_ctx), limit_(static_cast(limit)) {} + + template + inline void operator()(Function func) const { +#ifdef __HIPCC__ + // HIP will throw core dump when threads > 256 + constexpr int num_threads = 256; +#elif WITH_NV_JETSON + // JETSON_NANO will throw core dump when threads > 128 + int num_thread = 256; + platform::ChangeThreadNum(dev_ctx_, &num_thread, 128); + const int num_threads = num_thread; +#else + constexpr int num_threads = 1024; +#endif + size_t block_size = limit_ <= num_threads ? limit_ : num_threads; + size_t grid_size = (limit_ + num_threads - 1) / num_threads; + + if (grid_size == 1) { + ForRangeElemwiseOpGridIsOne<<<1, block_size, 0, dev_ctx_.stream()>>>( + func); + } else { + ForRangeElemwiseOp<<>>( + func, limit_); + } + } + + const pten::GPUContext& dev_ctx_; + size_t limit_; +}; + #endif } // namespace platform diff --git a/paddle/fluid/platform/stream/cuda_stream.cc b/paddle/fluid/platform/stream/cuda_stream.cc index 5697bbee0bb92..e3e735d03aba1 100644 --- a/paddle/fluid/platform/stream/cuda_stream.cc +++ b/paddle/fluid/platform/stream/cuda_stream.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/fluid/platform/stream/cuda_stream.h" #include "paddle/fluid/platform/cuda_device_guard.h" +#include "paddle/fluid/platform/device/gpu/gpu_types.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" @@ -116,11 +117,8 @@ CUDAStream* get_current_stream(int deviceId) { platform::Place device = CUDAPlace(deviceId); - auto stream = static_cast(pool.Get(device)) - ->context() - ->Stream() - .get(); - return stream; + return static_cast(pool.Get(device)) + ->GetCudaStream(); #else PADDLE_THROW(platform::errors::Unavailable( "Paddle is not compiled with CUDA. Cannot visit cuda current stream.")); @@ -133,12 +131,12 @@ CUDAStream* set_current_stream(CUDAStream* stream) { auto& device = stream->GetPlace(); auto& pool = platform::DeviceContextPool::Instance(); return static_cast(pool.Get(device)) - ->context() - ->SetStream(stream); + ->SetCudaStream(stream); #else PADDLE_THROW(platform::errors::Unavailable( - "Paddle is not compiled with CUDA. Cannot visit cuda current stream.")); - return nullptr; + "Paddle is not compiled with CUDA. Cannot visit cuda current" + "stream.")); + return CUDAStream(nullptr); #endif } } // namespace stream diff --git a/paddle/fluid/platform/stream/cuda_stream.h b/paddle/fluid/platform/stream/cuda_stream.h index 0683cf4b0424e..2b54f0861f426 100644 --- a/paddle/fluid/platform/stream/cuda_stream.h +++ b/paddle/fluid/platform/stream/cuda_stream.h @@ -15,6 +15,7 @@ limitations under the License. 
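// ---- Editor's note (illustrative sketch, not part of the patch) ----
// Usage of the new ForRange<pten::GPUContext> specialization added above. The
// element type T and the SquareFunctor name are illustrative assumptions; the
// functor form mirrors existing ForRange call sites.
template <typename T>
struct SquareFunctor {
  const T* in;
  T* out;
  HOSTDEVICE void operator()(size_t i) const { out[i] = in[i] * in[i]; }
};
// Inside a kernel that holds a const pten::GPUContext& dev_ctx:
//   platform::ForRange<pten::GPUContext> for_range(dev_ctx, n);
//   for_range(SquareFunctor<T>{in_ptr, out_ptr});
// ---- end editor's note ----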
*/ #pragma once #include +#include #include #include "paddle/fluid/platform/device/gpu/gpu_info.h" @@ -51,24 +52,28 @@ class CUDAStream final { const StreamFlag& flag = StreamFlag::kDefaultFlag) { Init(place, priority, flag); } + explicit CUDAStream(gpuStream_t stream, const Place& place) + : place_(place), stream_(stream) { + owned_stream_ = false; + callback_manager_.reset(new StreamCallbackManager(stream_)); + } virtual ~CUDAStream() { Destroy(); } bool Init(const Place& place, const Priority& priority = Priority::kNormal, const StreamFlag& flag = StreamFlag::kDefaultFlag); - template - void AddCallback(Callback&& callback) const { + void AddCallback(std::function callback) const { callback_manager_->AddCallback(callback); } - template #ifdef PADDLE_WITH_HIP - void RecordEvent(hipEvent_t ev, Callback callback) const { + void RecordEvent(hipEvent_t ev, const std::function& callback) const { callback(); PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(ev, stream_)); } #else - void RecordEvent(cudaEvent_t ev, Callback callback) const { + void RecordEvent(cudaEvent_t ev, + const std::function& callback) const { callback(); PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(ev, stream_)); } @@ -149,6 +154,7 @@ class CUDAStream final { }; CUDAStream* get_current_stream(int deviceId); +// NOTE: There is a problem with the interface and needs to be fixed CUDAStream* set_current_stream(CUDAStream* stream); } // namespace stream diff --git a/paddle/fluid/platform/transform_test.cu b/paddle/fluid/platform/transform_test.cu index 32ec113d1f5e5..e26fd511aa9af 100644 --- a/paddle/fluid/platform/transform_test.cu +++ b/paddle/fluid/platform/transform_test.cu @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include +#include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/platform/transform.h" @@ -57,6 +58,10 @@ TEST(Transform, CPUUnary) { TEST(Transform, GPUUnary) { CUDAPlace gpu0(0); CUDADeviceContext ctx(gpu0); + ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(gpu0, ctx.stream()) + .get()); + ctx.PartialInitWithAllocator(); float cpu_buf[4] = {0.1, 0.2, 0.3, 0.4}; auto gpu_allocation = Alloc(gpu0, sizeof(float) * 4); float* gpu_buf = static_cast(gpu_allocation->ptr()); @@ -84,6 +89,10 @@ TEST(Transform, GPUBinary) { int buf[4] = {1, 2, 3, 4}; CUDAPlace gpu0(0); CUDADeviceContext ctx(gpu0); + ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(gpu0, ctx.stream()) + .get()); + ctx.PartialInitWithAllocator(); auto gpu_allocation = Alloc(gpu0, sizeof(buf)); int* gpu_buf = static_cast(gpu_allocation->ptr()); Copy(gpu0, gpu_buf, CPUPlace(), buf, sizeof(buf), ctx.stream()); diff --git a/paddle/fluid/pybind/cuda_streams_py.cc b/paddle/fluid/pybind/cuda_streams_py.cc index 21571e17a2b48..64c145c94f99d 100644 --- a/paddle/fluid/pybind/cuda_streams_py.cc +++ b/paddle/fluid/pybind/cuda_streams_py.cc @@ -34,7 +34,7 @@ void BindCudaStream(py::module *m_ptr) { return paddle::platform::stream::get_current_stream(deviceId); #else PADDLE_THROW(platform::errors::Unavailable( - "Paddle is not compiled with CUDA. Cannot visit cuda current " + "Paddle is not compiled with CUDA. 
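// ---- Editor's note (illustrative sketch, not part of the patch) ----
// The new CUDAStream(gpuStream_t, const Place&) constructor wraps a stream the
// object does not own (owned_stream_ = false), so destroying the wrapper never
// destroys the underlying stream. The patch itself uses it inside the
// CUDADeviceContext constructor to expose the pten::GPUContext stream through
// GetCudaStream()/SetCudaStream(). Stand-alone sketch (the stream-creation
// call below is only for illustration):
gpuStream_t raw_stream;
#ifdef PADDLE_WITH_HIP
hipStreamCreate(&raw_stream);
#else
cudaStreamCreate(&raw_stream);
#endif
platform::stream::CUDAStream wrapper(raw_stream, platform::CUDAPlace(0));
// wrapper.raw_stream() == raw_stream; callbacks registered through
// wrapper.AddCallback(...) still run on raw_stream.
// ---- end editor's note ----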
Cannot visit cuda current" "stream.")); #endif }, @@ -119,7 +119,7 @@ void BindCudaStream(py::module *m_ptr) { [](paddle::platform::stream::CUDAStream &self, paddle::platform::stream::CUDAStream &stream) { paddle::platform::CudaEvent event; - event.Record(stream); + event.Record(stream.raw_stream()); self.WaitEvent(event.GetRawCudaEvent()); }, @@ -179,7 +179,7 @@ void BindCudaStream(py::module *m_ptr) { if (event == nullptr) { event = new paddle::platform::CudaEvent(); } - event->Record(self); + event->Record(self.raw_stream()); return event; }, R"DOC( @@ -321,7 +321,7 @@ void BindCudaStream(py::module *m_ptr) { if (stream == nullptr) { stream = paddle::platform::stream::get_current_stream(-1); } - self.Record(*stream); + self.Record(stream->raw_stream()); }, R"DOC( Records the event in the given stream. diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index e8dbe41ebba3a..9a535f5fb0492 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -1596,7 +1596,20 @@ All parameter, weight, gradient are variables in Paddle. .def_static("create", [](paddle::platform::CPUPlace& place) -> paddle::platform::DeviceContext* { - return new paddle::platform::CPUDeviceContext(); + auto* context = new paddle::platform::CPUDeviceContext(); + context->SetAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(place) + .get()); + context->SetHostAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + context->SetZeroAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetZeroAllocator(place) + .get()); + return context; }) .def_static("create", [](paddle::platform::XPUPlace& place) @@ -1607,7 +1620,20 @@ All parameter, weight, gradient are variables in Paddle. "Cannot use XPUPlace in CPU/GPU version, " "Please recompile or reinstall Paddle with XPU support.")); #else - return new paddle::platform::XPUDeviceContext(place); + auto* context = new paddle::platform::XPUDeviceContext(place); + context->SetAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(place) + .get()); + context->SetHostAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + context->SetZeroAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetZeroAllocator(place) + .get()); + return context; #endif }) .def_static("create", @@ -1643,7 +1669,21 @@ All parameter, weight, gradient are variables in Paddle. 
"Cannot use CUDAPlace in CPU only version, " "Please recompile or reinstall Paddle with CUDA support.")); #else - return new paddle::platform::CUDADeviceContext(place); + auto* context = new paddle::platform::CUDADeviceContext(place); + context->SetAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(place, context->stream()) + .get()); + context->SetHostAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + context->SetZeroAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetZeroAllocator(place) + .get()); + context->PartialInitWithAllocator(); + return context; #endif }) .def_static("create", diff --git a/paddle/pten/backends/CMakeLists.txt b/paddle/pten/backends/CMakeLists.txt index e9f222d642ea0..cc9352892030a 100644 --- a/paddle/pten/backends/CMakeLists.txt +++ b/paddle/pten/backends/CMakeLists.txt @@ -2,6 +2,10 @@ add_subdirectory(dynload) add_subdirectory(cpu) +if(WITH_GPU OR WITH_ROCM) + add_subdirectory(gpu) +endif() + if(WITH_XPU) add_subdirectory(xpu) endif() @@ -11,3 +15,7 @@ cc_library(pten_context SRCS all_context.cc DEPS device_context cpu_context) if(WITH_XPU) add_dependencies(pten_context xpu_context) endif() + +if(WITH_GPU) + add_dependencies(pten_context gpu_context) +endif() diff --git a/paddle/pten/backends/cpu/cpu_context.cc b/paddle/pten/backends/cpu/cpu_context.cc index efce128596b81..4029c286a5b28 100644 --- a/paddle/pten/backends/cpu/cpu_context.cc +++ b/paddle/pten/backends/cpu/cpu_context.cc @@ -15,75 +15,59 @@ #include "paddle/pten/backends/cpu/cpu_context.h" #include "paddle/pten/api/ext/exception.h" +#include "paddle/pten/common/place.h" // NOTE: The paddle framework should add WITH_EIGEN option to support compile // without eigen. +#include "paddle/pten/core/device_context.h" #include "unsupported/Eigen/CXX11/Tensor" namespace pten { -struct CPUContext::CPUImpl { - CPUImpl() { device_ = new Eigen::DefaultDevice(); } +struct CPUContext::Impl { + Impl() : place_(CPUPlace()) {} - // Users need to manage external resources. 
- explicit CPUImpl(const CPUContextResource& ctx_res) : res_(ctx_res) { - device_ = res_.device; - } + explicit Impl(const Place& place) : place_(place) {} - ~CPUImpl() { - if (res_.device == nullptr && device_ != nullptr) { - delete device_; - device_ = nullptr; + ~Impl() { + if (owned_) { + delete eigen_device_; } } - Eigen::DefaultDevice* GetEigenDevice() const { - PD_CHECK(device_ != nullptr, "the eigen_device is nullptr."); - return device_; + void Init() { + owned_ = true; + eigen_device_ = new Eigen::DefaultDevice(); } - void SetEigenDevice(Eigen::DefaultDevice* device) { - if (device == nullptr) { - return; - } - res_.device = device; - device_ = device; + Eigen::DefaultDevice* GetEigenDevice() const { + PD_CHECK(eigen_device_ != nullptr, "the cpu eigen_device is nullptr."); + return eigen_device_; } - Place GetPlace() const { return place_; } - - Eigen::DefaultDevice* device_{nullptr}; - CPUContextResource res_; - CPUPlace place_; + bool owned_{false}; + Eigen::DefaultDevice* eigen_device_{nullptr}; + Place place_; }; -CPUContext::CPUContext() : DeviceContext() { - cpu_impl_ = std::make_unique(); -} - -CPUContext::CPUContext(const CPUContext& other) : DeviceContext() { - cpu_impl_ = std::make_unique(); - cpu_impl_->SetEigenDevice(other.eigen_device()); -} +CPUContext::CPUContext() + : DeviceContext(), impl_(std::make_unique()) {} -CPUContext::CPUContext(CPUContext&& other) : DeviceContext() { - cpu_impl_ = std::move(other.cpu_impl_); -} +CPUContext::CPUContext(const Place& place) + : DeviceContext(), impl_(std::make_unique(place)) {} CPUContext::~CPUContext() = default; -CPUContext::CPUContext(const CPUContextResource& ctx_res) : DeviceContext() { - cpu_impl_ = std::make_unique(ctx_res); -} +void CPUContext::Init() { impl_->Init(); } Eigen::DefaultDevice* CPUContext::eigen_device() const { - return cpu_impl_->GetEigenDevice(); + return impl_->GetEigenDevice(); } +const Place& CPUContext::GetPlace() const { return impl_->place_; } + void CPUContext::SetEigenDevice(Eigen::DefaultDevice* device) { - cpu_impl_->SetEigenDevice(device); + impl_->eigen_device_ = device; } -Place CPUContext::GetPlace() const { return cpu_impl_->GetPlace(); } - } // namespace pten diff --git a/paddle/pten/backends/cpu/cpu_context.h b/paddle/pten/backends/cpu/cpu_context.h index 059588dc712ea..dca87a786b961 100644 --- a/paddle/pten/backends/cpu/cpu_context.h +++ b/paddle/pten/backends/cpu/cpu_context.h @@ -24,37 +24,29 @@ limitations under the License. */ namespace pten { -struct CPUContextResource { - Eigen::DefaultDevice* device{nullptr}; -}; - class CPUContext : public DeviceContext { public: - // NOTE: DeviceContext hold resources. Used in training scenarios. CPUContext(); - - // NOTE: Share the same underlying resources, please ensure that resources are - // not released. - CPUContext(const CPUContext&); - - CPUContext(CPUContext&&); - - ~CPUContext(); - + explicit CPUContext(const Place&); + virtual ~CPUContext(); Eigen::DefaultDevice* eigen_device() const; - - // TODO(wilber): Whether the interface should be preserved. - Place GetPlace() const override; + const Place& GetPlace() const override; public: - // NOTE: External users manage resources. Used in inference scenarios. - explicit CPUContext(const CPUContextResource& ctx_res); + // NOTE: DeviceContext hold resources. Used in training scenarios. + // The interface used by the training scene, DeviceContext will initialize + // all resources and delete them when destructing. + void Init(); + protected: + // NOTE: External users manage resources. 
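// ---- Editor's note (self-contained mock, not Paddle code) ----
// The ownership rule behind the rewritten CPUContext::Impl: Init() creates and
// owns the Eigen device (training path), while the Set... interface only
// borrows an externally managed pointer (inference path), so the destructor
// deletes nothing it did not create. Minimal mock of that rule:
struct Resource {};                 // stands in for Eigen::DefaultDevice

struct Impl {
  ~Impl() {
    if (owned_) delete resource_;   // only release what Init() created
  }
  void Init() {                     // training: the context owns the resource
    owned_ = true;
    resource_ = new Resource();
  }
  void SetResource(Resource* r) {   // inference: resource managed externally
    resource_ = r;                  // owned_ stays false, so no delete in dtor
  }
  bool owned_{false};
  Resource* resource_{nullptr};
};
// ---- end editor's note ----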
Used in inference scenarios. + // The Set interface is for inference only, DeviceContext will mark the + // resource as external, and will not delete any resource when destructing. void SetEigenDevice(Eigen::DefaultDevice* device); private: - struct CPUImpl; - std::unique_ptr cpu_impl_; + struct Impl; + std::unique_ptr impl_; }; } // namespace pten diff --git a/paddle/pten/backends/gpu/CMakeLists.txt b/paddle/pten/backends/gpu/CMakeLists.txt new file mode 100644 index 0000000000000..09591f79ae8fc --- /dev/null +++ b/paddle/pten/backends/gpu/CMakeLists.txt @@ -0,0 +1,9 @@ +if(WITH_GPU) + add_subdirectory(cuda) + nv_library(pten_gpu_info SRCS gpu_info.cc DEPS pten_cuda_info gflags glog enforce pten_dynload_cuda) +elseif(WITH_ROCM) + add_subdirectory(rocm) + hip_library(pten_gpu_info SRCS gpu_info.cc DEPS pten_rocm_info gflags glog enforce pten_dynload_cuda) +endif() + +cc_library(gpu_context SRCS gpu_context.cc DEPS pten_device_context pten_gpu_info eigen3) diff --git a/paddle/pten/backends/gpu/cuda/CMakeLists.txt b/paddle/pten/backends/gpu/cuda/CMakeLists.txt new file mode 100644 index 0000000000000..7eb1983a793bc --- /dev/null +++ b/paddle/pten/backends/gpu/cuda/CMakeLists.txt @@ -0,0 +1 @@ +nv_library(pten_cuda_info SRCS cuda_info.cc DEPS gflags glog enforce pten_dynload_cuda) diff --git a/paddle/pten/backends/gpu/cuda/cuda_helper.h b/paddle/pten/backends/gpu/cuda/cuda_helper.h new file mode 100644 index 0000000000000..baa1e43ce3c89 --- /dev/null +++ b/paddle/pten/backends/gpu/cuda/cuda_helper.h @@ -0,0 +1,72 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +namespace pten { +namespace backends { +namespace gpu { + +/* + * Summary: Grid stride looping macro in CUDA kernel + * + * [ Why need this macro? ] + * + * The original looping in CUDA kernel is: + * + * `for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + * i += blockDim.x * gridDim.x)` + * + * This for condition is risky. The value of `blockIdx.x * blockDim.x` + * may be large, such as over 1GB, the first iteration is no problem here, + * but when `i += blockDim.x * gridDim.x` is executed, the value of i + * will greater than INT_MAX and overflow becomes negative value, at + * this time, the cycle condition `i < (n)` is still satisfied, so it + * will cause illegal access to cuda memory. + * + * Here is a real example in ERINE, it will trigger above error. + * The related data are: + * - blockIdx.x = 2172938 + * - blockDim.x = 512 + * - blockIdx.x * blockDim.x = 1112543864 + * - INT_MAX = 2147483647 + * + * So we polish the for condition as follow, the int64_t __index__ will + * prevent overflow in the loop increment. 
+ * + * Parameters: + * - i: loop index + * - num: total element numbers + * + * Examples: + * template + * __global__ void Scale(T* logit_grad, const T* loss_grad, const int num, + * const int d, const int remain) { + * CUDA_KERNEL_LOOP(index, num) { + * int idx_n = index / d; + * int idx_remain = index % remain; + * logit_grad[index] *= loss_grad[idx_n * remain + idx_remain]; + * } + * } + * +*/ + +#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \ + int64_t __index__ = blockIdx.x * blockDim.x + threadIdx.x; \ + for (index_type i = __index__; __index__ < (num); \ + __index__ += blockDim.x * gridDim.x, i = __index__) + +} // namespace gpu +} // namespace backends +} // namespace pten diff --git a/paddle/fluid/platform/device/gpu/cuda/cuda_info.cc b/paddle/pten/backends/gpu/cuda/cuda_info.cc similarity index 71% rename from paddle/fluid/platform/device/gpu/cuda/cuda_info.cc rename to paddle/pten/backends/gpu/cuda/cuda_info.cc index 6109ed6554318..55766facac815 100644 --- a/paddle/fluid/platform/device/gpu/cuda/cuda_info.cc +++ b/paddle/pten/backends/gpu/cuda/cuda_info.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,20 +12,19 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/pten/backends/gpu/gpu_info.h" + +// TODO(pten): remove fluid headers. #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/lock_guard_ptr.h" -#include "paddle/fluid/platform/macros.h" -#include "paddle/fluid/platform/monitor.h" -#include "paddle/fluid/platform/place.h" -#include "paddle/fluid/string/split.h" static std::once_flag g_device_props_size_init_flag; static std::vector> g_device_props_init_flags; -static std::vector g_device_props; +static std::vector g_device_props; + +namespace pten { +namespace backends { +namespace gpu { -namespace paddle { -namespace platform { int DnnVersion() { if (!dynload::HasCUDNN()) return -1; return dynload::cudnnGetVersion(); @@ -75,11 +74,13 @@ int GetGPUDeviceCount() { } int GetGPUComputeCapability(int id) { - PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), - platform::errors::InvalidArgument( + PADDLE_ENFORCE_LT(id, + GetGPUDeviceCount(), + paddle::platform::errors::InvalidArgument( "Device id must be less than GPU count, " "but received id is: %d. GPU count is: %d.", - id, GetGPUDeviceCount())); + id, + GetGPUDeviceCount())); int major, minor; auto major_error_code = cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, id); @@ -92,22 +93,26 @@ int GetGPUComputeCapability(int id) { } int GetGPURuntimeVersion(int id) { - PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), - platform::errors::InvalidArgument( + PADDLE_ENFORCE_LT(id, + GetGPUDeviceCount(), + paddle::platform::errors::InvalidArgument( "Device id must be less than GPU count, " "but received id is: %d. 
GPU count is: %d.", - id, GetGPUDeviceCount())); + id, + GetGPUDeviceCount())); int runtime_version = 0; PADDLE_ENFORCE_GPU_SUCCESS(cudaRuntimeGetVersion(&runtime_version)); return runtime_version; } int GetGPUDriverVersion(int id) { - PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), - platform::errors::InvalidArgument( + PADDLE_ENFORCE_LT(id, + GetGPUDeviceCount(), + paddle::platform::errors::InvalidArgument( "Device id must be less than GPU count, " "but received id is: %d. GPU count is: %d.", - id, GetGPUDeviceCount())); + id, + GetGPUDeviceCount())); int driver_version = 0; PADDLE_ENFORCE_GPU_SUCCESS(cudaDriverGetVersion(&driver_version)); return driver_version; @@ -120,11 +125,13 @@ bool TensorCoreAvailable() { } int GetGPUMultiProcessors(int id) { - PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), - platform::errors::InvalidArgument( + PADDLE_ENFORCE_LT(id, + GetGPUDeviceCount(), + paddle::platform::errors::InvalidArgument( "Device id must be less than GPU count, " "but received id is: %d. GPU count is: %d.", - id, GetGPUDeviceCount())); + id, + GetGPUDeviceCount())); int count; PADDLE_ENFORCE_GPU_SUCCESS( cudaDeviceGetAttribute(&count, cudaDevAttrMultiProcessorCount, id)); @@ -132,11 +139,13 @@ int GetGPUMultiProcessors(int id) { } int GetGPUMaxThreadsPerMultiProcessor(int id) { - PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), - platform::errors::InvalidArgument( + PADDLE_ENFORCE_LT(id, + GetGPUDeviceCount(), + paddle::platform::errors::InvalidArgument( "Device id must be less than GPU count, " "but received id is: %d. GPU count is: %d.", - id, GetGPUDeviceCount())); + id, + GetGPUDeviceCount())); int count; PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceGetAttribute( &count, cudaDevAttrMaxThreadsPerMultiProcessor, id)); @@ -145,11 +154,13 @@ int GetGPUMaxThreadsPerMultiProcessor(int id) { } int GetGPUMaxThreadsPerBlock(int id) { - PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), - platform::errors::InvalidArgument( + PADDLE_ENFORCE_LT(id, + GetGPUDeviceCount(), + paddle::platform::errors::InvalidArgument( "Device id must be less than GPU count, " "but received id is: %d. GPU count is: %d.", - id, GetGPUDeviceCount())); + id, + GetGPUDeviceCount())); int count; PADDLE_ENFORCE_GPU_SUCCESS( cudaDeviceGetAttribute(&count, cudaDevAttrMaxThreadsPerBlock, id)); @@ -162,32 +173,34 @@ int GetCurrentDeviceId() { return device_id; } -dim3 GetGpuMaxGridDimSize(int id) { - PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), - platform::errors::InvalidArgument( +std::array GetGpuMaxGridDimSize(int id) { + PADDLE_ENFORCE_LT(id, + GetGPUDeviceCount(), + paddle::platform::errors::InvalidArgument( "Device id must be less than GPU count, " "but received id is: %d. 
GPU count is: %d.", - id, GetGPUDeviceCount())); - dim3 ret; + id, + GetGPUDeviceCount())); + std::array ret; int size; auto error_code_x = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimX, id); PADDLE_ENFORCE_GPU_SUCCESS(error_code_x); - ret.x = size; + ret[0] = size; auto error_code_y = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimY, id); PADDLE_ENFORCE_GPU_SUCCESS(error_code_y); - ret.y = size; + ret[1] = size; auto error_code_z = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimZ, id); PADDLE_ENFORCE_GPU_SUCCESS(error_code_z); - ret.z = size; + ret[2] = size; return ret; } const gpuDeviceProp &GetDeviceProperties(int id) { std::call_once(g_device_props_size_init_flag, [&] { int gpu_num = 0; - gpu_num = platform::GetGPUDeviceCount(); + gpu_num = GetGPUDeviceCount(); g_device_props_init_flags.resize(gpu_num); g_device_props.resize(gpu_num); for (int i = 0; i < gpu_num; ++i) { @@ -196,16 +209,17 @@ const gpuDeviceProp &GetDeviceProperties(int id) { }); if (id == -1) { - id = platform::GetCurrentDeviceId(); + id = GetCurrentDeviceId(); } if (id < 0 || id >= static_cast(g_device_props.size())) { - PADDLE_THROW(platform::errors::OutOfRange( + PADDLE_THROW(paddle::platform::errors::OutOfRange( "The device id %d is out of range [0, %d), where %d is the number of " "devices on this machine. Because the device id should be greater than " "or equal to zero and smaller than the number of gpus. Please input " "appropriate device again!", - id, static_cast(g_device_props.size()), + id, + static_cast(g_device_props.size()), static_cast(g_device_props.size()))); } @@ -219,32 +233,43 @@ const gpuDeviceProp &GetDeviceProperties(int id) { void SetDeviceId(int id) { // TODO(qijun): find a better way to cache the cuda device count - PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), - platform::errors::InvalidArgument( + PADDLE_ENFORCE_LT(id, + GetGPUDeviceCount(), + paddle::platform::errors::InvalidArgument( "Device id must be less than GPU count, " "but received id is: %d. 
GPU count is: %d.", - id, GetGPUDeviceCount())); + id, + GetGPUDeviceCount())); PADDLE_RETRY_CUDA_SUCCESS(cudaSetDevice(id)); } -void GpuMemcpyAsync(void *dst, const void *src, size_t count, - gpuMemcpyKind kind, gpuStream_t stream) { +void GpuMemcpyAsync(void *dst, + const void *src, + size_t count, + gpuMemcpyKind kind, + gpuStream_t stream) { PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(dst, src, count, kind, stream)); } -void GpuMemcpySync(void *dst, const void *src, size_t count, +void GpuMemcpySync(void *dst, + const void *src, + size_t count, gpuMemcpyKind kind) { PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpy(dst, src, count, kind)); } -void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src, - int src_device, size_t count, gpuStream_t stream) { +void GpuMemcpyPeerAsync(void *dst, + int dst_device, + const void *src, + int src_device, + size_t count, + gpuStream_t stream) { PADDLE_ENFORCE_GPU_SUCCESS( cudaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream)); } -void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src, - int src_device, size_t count) { +void GpuMemcpyPeerSync( + void *dst, int dst_device, const void *src, int src_device, size_t count) { PADDLE_ENFORCE_GPU_SUCCESS( cudaMemcpyPeer(dst, dst_device, src, src_device, count)); } @@ -264,5 +289,7 @@ void GpuDestroyStream(gpuStream_t stream) { void GpuDeviceSync() { PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); } gpuError_t GpuGetLastError() { return cudaGetLastError(); } -} // namespace platform -} // namespace paddle + +} // namespace gpu +} // namespace backends +} // namespace pten diff --git a/paddle/pten/backends/gpu/forwards.h b/paddle/pten/backends/gpu/forwards.h new file mode 100644 index 0000000000000..d0787159e1e30 --- /dev/null +++ b/paddle/pten/backends/gpu/forwards.h @@ -0,0 +1,111 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +// Forward-declares CUDA API types used in platform-agnostic wrapper headers. +#pragma once + +/// Forward declaration of Eigen types. +namespace Eigen { +struct GpuDevice; +} // namespace Eigen + +/// Forward declaration of CUDA types. + +// Forward declaration of CUDA runtime types. +using cudaStream_t = struct CUstream_st *; +using cudaEvent_t = struct CUevent_st *; + +// Forward declaration of cuDNN types. 
+using cudnnHandle_t = struct cudnnContext *; +using cudnnTensorDescriptor_t = struct cudnnTensorStruct *; +using cudnnConvolutionDescriptor_t = struct cudnnConvolutionStruct *; +using cudnnPoolingDescriptor_t = struct cudnnPoolingStruct *; +using cudnnFilterDescriptor_t = struct cudnnFilterStruct *; +using cudnnLRNDescriptor_t = struct cudnnLRNStruct *; +using cudnnActivationDescriptor_t = struct cudnnActivationStruct *; +using cudnnSpatialTransformerDescriptor_t = + struct cudnnSpatialTransformerStruct *; +using cudnnOpTensorDescriptor_t = struct cudnnOpTensorStruct *; +using cudnnReduceTensorDescriptor_t = struct cudnnReduceTensorStruct *; +using cudnnCTCLossDescriptor_t = struct cudnnCTCLossStruct *; +using cudnnTensorTransformDescriptor_t = struct cudnnTensorTransformStruct *; +using cudnnDropoutDescriptor_t = struct cudnnDropoutStruct *; +using cudnnRNNDescriptor_t = struct cudnnRNNStruct *; +using cudnnPersistentRNNPlan_t = struct cudnnPersistentRNNPlan *; +using cudnnRNNDataDescriptor_t = struct cudnnRNNDataStruct *; +using cudnnAlgorithmDescriptor_t = struct cudnnAlgorithmStruct *; +using cudnnAlgorithmPerformance_t = struct cudnnAlgorithmPerformanceStruct *; +using cudnnSeqDataDescriptor_t = struct cudnnSeqDataStruct *; +using cudnnAttnDescriptor_t = struct cudnnAttnStruct *; +using cudnnFusedOpsConstParamPack_t = struct cudnnFusedOpsConstParamStruct *; +using cudnnFusedOpsVariantParamPack_t = + struct cudnnFusedOpsVariantParamStruct *; +using cudnnFusedOpsPlan_t = struct cudnnFusedOpsPlanStruct *; + +// Forward declaration of cuBLAS types. +using cublasHandle_t = struct cublasContext *; + +// Forward declaration of cuSOLVER types. +using cusolverDnHandle_t = struct cusolverDnContext *; + +// Forward declaration of cuSparse types. +using cusparseHandle_t = struct cusparseContext *; + +// Forward declaration of cuFFT types. +using cufftHandle = int; + +// Forward declaration of NCCL types. +using ncclComm_t = struct ncclComm *; + +/// Forward declaration of ROCM types. +#include + +using hipDevice_t = int; +using hipCtx_t = struct ihipCtx_t *; +using hipModule_t = struct ihipModule_t *; +using hipStream_t = struct ihipStream_t *; +using hipEvent_t = struct ihipEvent_t *; +using hipFunction_t = struct ihipModuleSymbol_t *; + +// Forward declaration of MIOpen types. +using miopenHandle_t = struct miopenHandle *; +using miopenAcceleratorQueue_t = hipStream_t; +using miopenFusionOpDescriptor_t = struct miopenFusionOpDescriptor *; +using miopenTensorDescriptor_t = struct miopenTensorDescriptor *; +using miopenConvolutionDescriptor_t = struct miopenConvolutionDescriptor *; +using miopenPoolingDescriptor_t = struct miopenPoolingDescriptor *; +using miopenLRNDescriptor_t = struct miopenLRNDescriptor *; +using miopenActivationDescriptor_t = struct miopenActivationDescriptor *; +using miopenRNNDescriptor_t = struct miopenRNNDescriptor *; +using miopenCTCLossDescriptor_t = struct miopenCTCLossDescriptor *; +using miopenDropoutDescriptor_t = struct miopenDropoutDescriptor *; +using miopenFusionPlanDescriptor_t = struct miopenFusionPlanDescriptor *; +using miopenOperatorDescriptor_t = struct miopenOperatorDescriptor *; +using miopenOperatorArgs_t = struct miopenOperatorArgs *; +using miopenAllocatorFunction = void *(*)(void *context, size_t sizeBytes); +// using miopenDeallocatorFunction = void *(*)(void *context, void *memory); +// struct miopenConvAlgoPerf_t; +// struct miopenConvSolution_t; + +// Forward declaration of rocBLAS types. 
+using rocblas_handle = struct _rocblas_handle *; + +// Forward declaration of hipfft types. +using hipfftHandle = struct hipfftHandle_t *; + +// Forward declaration of rocSOLVER types. +using rocsolver_handle = rocblas_handle; + +// Forward declaration of rocSparse types. +using rocsparse_handle = struct _rocsparse_handle *; diff --git a/paddle/pten/backends/gpu/gpu_context.cc b/paddle/pten/backends/gpu/gpu_context.cc new file mode 100644 index 0000000000000..1e707c46cc93d --- /dev/null +++ b/paddle/pten/backends/gpu/gpu_context.cc @@ -0,0 +1,899 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include "paddle/pten/backends/gpu/gpu_context.h" +#include +#include +#include +#include +#include + +#include "paddle/pten/api/ext/exception.h" + +#include "paddle/pten/backends/gpu/gpu_decls.h" +#include "paddle/pten/backends/gpu/gpu_info.h" +#include "paddle/pten/common/float16.h" +#include "paddle/pten/common/place.h" +#include "paddle/pten/core/allocator.h" + +#ifdef PADDLE_WITH_CUDA +#include "paddle/pten/backends/dynload/cublas.h" +#include "paddle/pten/backends/dynload/cudnn.h" +#include "paddle/pten/backends/dynload/cusolver.h" +#include "paddle/pten/backends/dynload/cusparse.h" +#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) +#include "paddle/pten/backends/dynload/nccl.h" +#endif // !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) +#endif // PADDLE_WITH_CUDA + +#ifdef PADDLE_WITH_HIP +#include "paddle/pten/backends/dynload/miopen.h" +#include "paddle/pten/backends/dynload/rocblas.h" +#if !defined(__APPLE__) && defined(PADDLE_WITH_RCCL) +#include "paddle/pten/backends/dynload/rccl.h" +#endif // !defined(__APPLE__) && defined(PADDLE_WITH_RCCL) +#endif // PADDLE_WITH_HIP + +// NOTE: The paddle framework should add WITH_EIGEN option to support compile +// without eigen. +#include "unsupported/Eigen/CXX11/Tensor" + +// TODO(pten): remove fluid header. 
+#include "paddle/fluid/platform/enforce.h" + +namespace pten { + +namespace internal { + +class EigenGpuStreamDevice : public Eigen::StreamInterface { + public: + EigenGpuStreamDevice() : scratch_(nullptr), semaphore_(nullptr) { + Eigen::initializeDeviceProp(); + } + ~EigenGpuStreamDevice() override {} + + void Reinitialize(gpuStream_t cuda_stream, + Allocator* allocator, + GPUPlace place) { + stream_ = cuda_stream; + place_ = place; + allocator_ = allocator; + device_prop_ = &Eigen::m_deviceProperties[place.device]; + } + + const gpuStream_t& stream() const override { return stream_; } + + const gpuDeviceProp& deviceProperties() const override { + return *device_prop_; + } + + void* allocate(size_t num_bytes) const override { + if (UNLIKELY(num_bytes == 0)) { + return nullptr; + } + auto buf = allocator_->Allocate(num_bytes); + VLOG(4) << "Eigen allocated at " << buf->ptr() << " requested " + << num_bytes; + void* retv = buf->ptr(); + { + std::lock_guard lock(mtx_); + allocations_.emplace(retv, std::move(buf)); + } + return retv; + } + + void deallocate(void* buffer) const override { + if (LIKELY(buffer)) { + std::lock_guard lock(mtx_); + allocations_.erase(buffer); + } + } + + void* scratchpad() const override { + if (scratch_ == NULL) { + scratch_ = allocate(Eigen::kGpuScratchSize + sizeof(unsigned int)); + } + return scratch_; + } + + unsigned int* semaphore() const override { + if (semaphore_ == NULL) { + char* scratch = static_cast(scratchpad()) + Eigen::kGpuScratchSize; + semaphore_ = reinterpret_cast(scratch); +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS( + hipMemsetAsync(semaphore_, 0, sizeof(unsigned int), stream_)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), stream_)); +#endif + } + return semaphore_; + } + + private: + GPUPlace place_; + gpuStream_t stream_; // not owned; + Allocator* allocator_; // not owned; + const gpuDeviceProp* device_prop_; // not owned; + mutable void* scratch_; + mutable unsigned int* semaphore_; + mutable std::mutex mtx_; // to protect allocations_ + mutable std::unordered_map allocations_; +}; + +#ifdef PADDLE_WITH_HIP +static void StreamCallbackFunc(gpuStream_t stream, + gpuError_t status, + void* user_data) +#endif +#ifdef PADDLE_WITH_CUDA +#if CUDA_VERSION >= 10000 + static void CUDART_CB StreamCallbackFunc(void* user_data) +#else + static void CUDART_CB + StreamCallbackFunc(cudaStream_t stream, cudaError_t status, void* user_data) +#endif +#endif +{ + std::unique_ptr> func( + reinterpret_cast*>(user_data)); + (*func)(); +} + +} // namespace internal + +class DnnWorkspaceHandle { + public: + explicit inline DnnWorkspaceHandle(Allocator* allocator) + : allocator_(allocator) {} + + inline void RunFunc(const std::function& cudnn_func, + size_t required_workspace_bytes) { + if (required_workspace_bytes > WorkspaceSize()) { + ReallocWorkspace(required_workspace_bytes); + } + VLOG(2) << "Cudnn workspace size at RunFunc: " + << static_cast(WorkspaceSize()) / (1 << 20) << " MB"; + { + std::lock_guard guard(mtx_); + cudnn_func(allocation_ ? allocation_->ptr() : nullptr); + } + } + + /*! \brief Thread which call RunFuncSync() would release gpu memory after + * running the function. 
Currently this function is only used when cudnn + * exhaustive searching and callers have to guarantee that the input function + * is host blocking */ + inline void RunFuncSync(const std::function& cudnn_func, + size_t required_workspace_bytes) { + RunFunc(cudnn_func, required_workspace_bytes); + ResetWorkspace(); + } + + inline size_t WorkspaceSize() { + if (allocation_ == nullptr) { + return 0; + } + return allocation_->size(); + } + + void ResetWorkspace() { allocation_ = nullptr; } + + void ReallocWorkspace(size_t required_workspace_bytes) { + if (required_workspace_bytes <= WorkspaceSize()) return; + // reset allocation first before re-allocate to save memory + allocation_.reset(); + allocation_ = allocator_->Allocate(required_workspace_bytes); + } + + private: + Allocator::AllocationPtr allocation_{nullptr}; + Allocator* allocator_{nullptr}; + std::mutex mtx_; +}; + +struct GPUContext::Impl { + void Init() { + owned_ = true; + backends::gpu::GPUDeviceGuard guard(place_.device); + InitGpuProperties(); + InitStream(); + InitEigenDevice(); + InitBlasHandle(); + InitDNNHandle(); + InitSolverHandle(); + InitSparseHandle(); + InitDnnWorkspace(); + } + + void PartialInitWithoutAllocator() { + owned_ = true; + backends::gpu::GPUDeviceGuard guard(place_.device); + InitGpuProperties(); + InitStream(); + InitBlasHandle(); + InitDNNHandle(); + InitSolverHandle(); + InitSparseHandle(); + } + + void PartialInitWithAllocator() { + owned_ = true; + backends::gpu::GPUDeviceGuard guard(place_.device); + InitEigenDevice(); + InitDnnWorkspace(); + } + + Impl() : place_(GPUPlace()) {} + + explicit Impl(const GPUPlace& place) : place_(place) {} + + ~Impl() { + backends::gpu::GPUDeviceGuard guard(place_.device); + DestoryInternalWorkspace(); + DestoryInternalEigenDevice(); + DestroyInternalSparseHandle(); + DestroyInternalSolverHandle(); + DestroyInternalDnnHandle(); +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + if (nccl_comm_) { + PADDLE_ENFORCE_GPU_SUCCESS(dynload::ncclCommDestroy(nccl_comm_)); + } +#endif + DestroyInternalBlasHandle(); + DestoryInternalStream(); + } + + const Place& GetPlace() const { return place_; } + + bool IsTensorCoreAvailable() const { + return blas_tensor_core_handle_ != nullptr; + } + + void InitGpuProperties() { + backends::gpu::GPUDeviceGuard guard(place_.GetDeviceId()); + compute_capability_ = + backends::gpu::GetGPUComputeCapability(place_.GetDeviceId()); + multi_process_ = backends::gpu::GetGPUMultiProcessors(place_.GetDeviceId()); + max_threads_per_mp_ = + backends::gpu::GetGPUMaxThreadsPerMultiProcessor(place_.GetDeviceId()); + max_grid_dim_size_ = + backends::gpu::GetGpuMaxGridDimSize(place_.GetDeviceId()); + max_threads_per_block_ = + backends::gpu::GetGPUMaxThreadsPerBlock(place_.GetDeviceId()); + driver_version_ = backends::gpu::GetGPUDriverVersion(place_.GetDeviceId()); + runtime_version_ = + backends::gpu::GetGPURuntimeVersion(place_.GetDeviceId()); + + // TODO(wilber): glog may be replaced in the future? + LOG_FIRST_N(WARNING, 1) + << "Please NOTE: device: " << static_cast(place_.device) + << ", GPU Compute Capability: " << compute_capability_ / 10 << "." + << compute_capability_ % 10 + << ", Driver API Version: " << driver_version_ / 1000 << "." + << (driver_version_ % 100) / 10 + << ", Runtime API Version: " << runtime_version_ / 1000 << "." 
+ << (runtime_version_ % 100) / 10; +#ifdef PADDLE_WITH_HIP + size_t miopen_major, miopen_minor, miopen_patch; + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::miopenGetVersion(&miopen_major, &miopen_minor, &miopen_patch)); + auto cudnn_dso_ver = + (miopen_major * 1000 + miopen_minor * 10 + miopen_patch) / 10; + auto compile_miopen_version = MIOPEN_VERSION / 10; + if (cudnn_dso_ver < static_cast(compile_miopen_version)) { + LOG_FIRST_N(WARNING, 1) + << "WARNING: device: " << static_cast(place_.device) + << ". The installed Paddle is compiled with MIOPEN " + << compile_miopen_version / 100 << "." << compile_miopen_version % 100 + << ", but MIOPEN version in your machine is " << cudnn_dso_ver / 100 + << "." << cudnn_dso_ver % 100 + << ", which may cause serious incompatible bug. " + << "Please recompile or reinstall Paddle with compatible MIOPEN " + "version."; + } +#else + size_t cudnn_dso_ver = dynload::cudnnGetVersion(); + LOG_FIRST_N(WARNING, 1) << "device: " << static_cast(place_.device) + << ", cuDNN Version: " << cudnn_dso_ver / 1000 + << "." << (cudnn_dso_ver % 1000) / 100 << "."; + + // Check CUDA/CUDNN version compatiblity + auto local_cuda_version = + (driver_version_ / 1000) * 10 + (driver_version_ % 100) / 10; + auto compile_cuda_version = + (CUDA_VERSION / 1000) * 10 + (CUDA_VERSION % 100) / 10; + if (local_cuda_version < compile_cuda_version) { + LOG_FIRST_N(WARNING, 1) + << "WARNING: device: " << static_cast(place_.device) + << ". The installed Paddle is compiled with CUDA " + << compile_cuda_version / 10 << "." << compile_cuda_version % 10 + << ", but CUDA runtime version in your machine is " + << local_cuda_version / 10 << "." << local_cuda_version % 10 + << ", which may cause serious incompatible bug. " + << "Please recompile or reinstall Paddle with compatible CUDA " + "version."; + } +#endif + } + + void InitDnnWorkspace() { + PD_CHECK(allocator_ != nullptr, + "the device allocator for gpu context is nullptr."); + workspace_ = new DnnWorkspaceHandle(allocator_); + } + + void DestoryInternalWorkspace() { + if (owned_ && workspace_ != nullptr) { + delete workspace_; + stream_ = nullptr; + } + } + + DnnWorkspaceHandle* GetDnnWorkspace() { + PD_CHECK(workspace_ != nullptr, "the gpu cudnn workspace is nullptr."); + return workspace_; + } + + void InitStream() { +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS( + hipStreamCreateWithPriority(&stream_, hipStreamDefault, 0)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + cudaStreamCreateWithPriority(&stream_, cudaStreamDefault, 0)); +#endif + } + + void DestoryInternalStream() { + if (owned_ && stream_ != nullptr) { +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(stream_)); +#else + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(stream_)); +#endif + } + stream_ = nullptr; + } + + void SetStream(gpuStream_t stream) { stream_ = stream; } + + gpuStream_t GetStream() const { + PD_CHECK(stream_ != nullptr, "the gpu stream is nullptr."); + return stream_; + } + + void InitEigenDevice() { + PD_CHECK(allocator_ != nullptr, + "the allocator for eigen device is nullptr."); + eigen_stream_.reset(new internal::EigenGpuStreamDevice()); + eigen_stream_->Reinitialize(stream_, allocator_, place_); + eigen_device_ = new Eigen::GpuDevice(eigen_stream_.get()); + } + + void DestoryInternalEigenDevice() { + if (owned_ && eigen_device_ != nullptr) { + delete eigen_device_; + eigen_device_ = nullptr; + } + } + + void SetEigenDevice(Eigen::GpuDevice* device) { eigen_device_ = device; } + + Eigen::GpuDevice* eigen_device() const { + 
PD_CHECK(eigen_device_ != nullptr, "the gpu eigen_device is nullptr."); + return eigen_device_; + } + + void InitBlasHandle() { +#ifdef PADDLE_WITH_HIP + pten::dynload::rocblas_create_handle(&blas_handle_); + pten::dynload::rocblas_set_stream(blas_handle_, stream_); +#else // PADDLE_WITH_CUDA + PADDLE_RETRY_CUDA_SUCCESS(pten::dynload::cublasCreate(&blas_handle_)); + PADDLE_RETRY_CUDA_SUCCESS( + pten::dynload::cublasSetStream(blas_handle_, stream_)); +#if CUDA_VERSION >= 9000 + PADDLE_RETRY_CUDA_SUCCESS( + pten::dynload::cublasCreate(&blas_tensor_core_handle_)); + PADDLE_RETRY_CUDA_SUCCESS( + pten::dynload::cublasSetStream(blas_tensor_core_handle_, stream_)); + PADDLE_RETRY_CUDA_SUCCESS(pten::dynload::cublasSetMathMode( + blas_tensor_core_handle_, CUBLAS_TENSOR_OP_MATH)); +#if CUDA_VERSION >= 11000 + PADDLE_RETRY_CUDA_SUCCESS( + pten::dynload::cublasCreate(&blas_tf32_tensor_core_handle_)); + PADDLE_RETRY_CUDA_SUCCESS( + pten::dynload::cublasSetStream(blas_tf32_tensor_core_handle_, stream_)); + PADDLE_RETRY_CUDA_SUCCESS(pten::dynload::cublasSetMathMode( + blas_tf32_tensor_core_handle_, CUBLAS_TF32_TENSOR_OP_MATH)); +#endif // CUDA_VERSION >= 11000 +#endif // CUDA_VERSION >= 9000 +#endif // PADDLE_WITH_HIP + } + + void DestroyInternalBlasHandle() { +#ifdef PADDLE_WITH_HIP + if (owned_ && blas_handle_ != nullptr) { + pten::dynload::rocblas_destroy_handle(blas_handle_); + blas_handle_ = nullptr; + } +#else + if (owned_ && blas_handle_ != nullptr) { + pten::dynload::cublasDestroy(blas_handle_); + blas_handle_ = nullptr; + } + if (owned_ && blas_tensor_core_handle_ != nullptr) { + pten::dynload::cublasDestroy(blas_tensor_core_handle_); + blas_tensor_core_handle_ = nullptr; + } + if (owned_ && blas_tf32_tensor_core_handle_ != nullptr) { + pten::dynload::cublasDestroy(blas_tf32_tensor_core_handle_); + blas_tf32_tensor_core_handle_ = nullptr; + } +#endif // PADDLE_WITH_HIP + } + + blasHandle_t GetBlasHandle() const { + PD_CHECK(blas_handle_ != nullptr, "the gpu blas handle is nullptr."); + return blas_handle_; + } + + void SetBlasHandle(blasHandle_t blas) { blas_handle_ = blas; } + + void InitDNNHandle() { + if (pten::dynload::HasCUDNN()) { +#ifdef PADDLE_WITH_HIP + size_t miopen_major, miopen_minor, miopen_patch; + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenGetVersion( + &miopen_major, &miopen_minor, &miopen_patch)); + auto local_miopen_version = + (miopen_major * 1000 + miopen_minor * 10 + miopen_patch) / 10; + auto compile_miopen_version = MIOPEN_VERSION / 10; + if (local_miopen_version < static_cast(compile_miopen_version)) { + LOG_FIRST_N(WARNING, 1) + << "WARNING: device: " << place_.device + << ". The installed Paddle is compiled with MIOPEN " + << compile_miopen_version / 100 << "." + << compile_miopen_version % 100 + << ", but MIOPEN version in your machine is " + << local_miopen_version / 100 << "." << local_miopen_version % 100 + << ", which may cause serious incompatible bug. " + << "Please recompile or reinstall Paddle with compatible MIOPEN " + "version."; + } + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenCreate(&dnn_handle_)); + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::miopenSetStream(dnn_handle_, stream_)); +#else + auto local_cudnn_version = pten::dynload::cudnnGetVersion() / 100; + auto compile_cudnn_version = CUDNN_VERSION / 100; + if (local_cudnn_version < static_cast(compile_cudnn_version)) { + LOG_FIRST_N(WARNING, 1) + << "WARNING: device: " << place_.device + << ". The installed Paddle is compiled with CUDNN " + << compile_cudnn_version / 10 << "." 
<< compile_cudnn_version % 10 + << ", but CUDNN version in your machine is " + << local_cudnn_version / 10 << "." << local_cudnn_version % 10 + << ", which may cause serious incompatible bug. " + << "Please recompile or reinstall Paddle with compatible CUDNN " + "version."; + } + PADDLE_RETRY_CUDA_SUCCESS(pten::dynload::cudnnCreate(&dnn_handle_)); + PADDLE_RETRY_CUDA_SUCCESS( + pten::dynload::cudnnSetStream(dnn_handle_, stream_)); +#endif + } else { + dnn_handle_ = nullptr; + } + } + + dnnHandle_t GetDnnHandle() { + PD_CHECK(dnn_handle_ != nullptr, "the gpu dnn handle is nullptr."); + return dnn_handle_; + } + + void DestroyInternalDnnHandle() { +#ifdef PADDLE_WITH_HIP + if (owned_ && dnn_handle_ != nullptr) { + PADDLE_ENFORCE_GPU_SUCCESS(pten::dynload::miopenDestroy(dnn_handle_)); + dnn_handle_ = nullptr; + } +#else + if (owned_ && dnn_handle_ != nullptr) { + PADDLE_ENFORCE_GPU_SUCCESS(pten::dynload::cudnnDestroy(dnn_handle_)); + dnn_handle_ = nullptr; + } +#endif // PADDLE_WITH_HIP + } + + void SetDnnHandle(dnnHandle_t handle) { dnn_handle_ = handle; } + + void InitSolverHandle() { +#ifndef PADDLE_WITH_HIP + PADDLE_RETRY_CUDA_SUCCESS(pten::dynload::cusolverDnCreate(&solver_handle_)); + PADDLE_RETRY_CUDA_SUCCESS( + pten::dynload::cusolverDnSetStream(solver_handle_, stream_)); +#endif + } + + void DestroyInternalSolverHandle() { +#ifndef PADDLE_WITH_HIP + if (owned_ && solver_handle_ != nullptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + pten::dynload::cusolverDnDestroy(solver_handle_)); + solver_handle_ = nullptr; + } +#endif + } + + solverHandle_t GetSolverHandle() const { + PD_CHECK(solver_handle_ != nullptr, "the gpu solver handle is nullptr."); + return solver_handle_; + } + + void SetSolverHandle(solverHandle_t handle) { solver_handle_ = handle; } + + void InitSparseHandle() { +// ROCM is not yet supported +#if defined(PADDLE_WITH_CUDA) +// The generic APIs is supported from CUDA10.1 +#if CUDA_VERSION >= 10010 + PADDLE_RETRY_CUDA_SUCCESS(dynload::cusparseCreate(&sparse_handle_)); + PADDLE_RETRY_CUDA_SUCCESS( + dynload::cusparseSetStream(sparse_handle_, stream_)); +#endif +#endif + } + + void DestroyInternalSparseHandle() { +#ifdef PADDLE_WITH_CUDA +#if CUDA_VERSION >= 10010 + if (owned_ && sparse_handle_ != nullptr) { + PADDLE_RETRY_CUDA_SUCCESS(dynload::cusparseDestroy(sparse_handle_)); + sparse_handle_ = nullptr; + } +#endif +#endif + } + + sparseHandle_t GetSparseHandle() const { + PD_CHECK(sparse_handle_ != nullptr, "the gpu sparse handle is nullptr."); + return sparse_handle_; + } + + void SetSparseHandle(sparseHandle_t handle) { sparse_handle_ = handle; } + + void Wait() const { +#ifdef PADDLE_WITH_HIP + hipError_t e_sync = hipSuccess; +#if !defined(_WIN32) + e_sync = hipStreamSynchronize(stream_); +#else + while (e_sync = hipStreamQuery(stream_)) { + if (e_sync == hipErrorNotReady) continue; + break; + } +#endif // !defined(_WIN32) +#else // PADDLE_WITH_HIP + cudaError_t e_sync = cudaSuccess; +#if !defined(_WIN32) + e_sync = cudaStreamSynchronize(stream_); +#else + while (e_sync = cudaStreamQuery(stream_)) { + if (e_sync == cudaErrorNotReady) continue; + break; + } +#endif // !defined(_WIN32) +#endif // PADDLE_WITH_HIP + + PADDLE_ENFORCE_GPU_SUCCESS(e_sync); + } + + void WaitEvent(gpuEvent_t ev) const { +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(stream_, ev, 0)); +#else + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(stream_, ev, 0)); +#endif + } + + ncclComm_t GetNcclComm() const { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + // 
PD_CHECK(nccl_comm_ != nullptr, "the gpu nccl_comm is nullptr."); + return nccl_comm_; +#endif + return nullptr; + } + + void SetNcclComm(ncclComm_t comm) { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + nccl_comm_ = comm; +#endif + } + + inline void CublasCall( + const std::function& callback) const { + if (blas_tf32_tensor_core_handle_ != nullptr) { + std::lock_guard guard(blas_tf32_mtx_); + callback(blas_tf32_tensor_core_handle_); + } else { + std::lock_guard guard(blas_mtx_); + callback(blas_handle_); + } + } + + inline void TensorCoreCublasCallIfAvailable( + const std::function& callback) const { + if (blas_tensor_core_handle_ != nullptr) { + std::lock_guard guard(blas_tensor_core_mtx_); + callback(blas_tensor_core_handle_); + } else { + std::lock_guard guard(blas_mtx_); + callback(blas_handle_); + } + } + + inline void CusparseCall( + const std::function& callback) const { + std::lock_guard guard(sparse_mtx_); + callback(sparse_handle_); + } + + void RecordEvent(gpuEvent_t ev, const std::function& callback) const { + callback(); + RecordEvent(ev); + } + + void RecordEvent(gpuEvent_t ev) const { +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(ev, stream_)); +#else + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(ev, stream_)); +#endif + } + + void AddStreamCallback(const std::function& callback) const { + // TODO(wilber): Do we need ThreadPool? + auto* func = new std::function([this, callback] { + std::lock_guard lock(stream_call_back_mtx_); + last_future_ = std::async(std::launch::deferred, [&]() { callback(); }); + }); + +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS( + hipStreamAddCallback(stream_, internal::StreamCallbackFunc, func, 0)); +#endif +#ifdef PADDLE_WITH_CUDA +#if CUDA_VERSION >= 10000 + PADDLE_ENFORCE_GPU_SUCCESS( + cudaLaunchHostFunc(stream_, internal::StreamCallbackFunc, func)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + cudaStreamAddCallback(stream_, internal::StreamCallbackFunc, func, 0)); +#endif +#endif + } + + void WaitStreamCallback() const { +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_CUDA) + pten::backends::gpu::GpuStreamSync(stream_); +#endif + { + std::lock_guard lock(stream_call_back_mtx_); + if (last_future_.valid()) { + last_future_.wait(); + } + } + } + + bool owned_{false}; + Place place_; + int compute_capability_; + int runtime_version_; + int driver_version_; + int multi_process_; + int max_threads_per_mp_; + int max_threads_per_block_; + std::array max_grid_dim_size_; + + gpuStream_t stream_{nullptr}; + Eigen::GpuDevice* eigen_device_{nullptr}; + blasHandle_t blas_handle_{nullptr}; + blasHandle_t blas_tensor_core_handle_{nullptr}; + blasHandle_t blas_tf32_tensor_core_handle_{nullptr}; + dnnHandle_t dnn_handle_{nullptr}; + solverHandle_t solver_handle_{nullptr}; + sparseHandle_t sparse_handle_{nullptr}; + DnnWorkspaceHandle* workspace_{nullptr}; + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + // NCCL communicator (single process version) for NCCL collective operations. + // NCCL collective operations provides fast collectives over multiple GPUs + // both within and across nodes. + // But, this collectives is used for collectives over multiple GPUs within + // nodes. + + // NOTE: Distributed communicator, distributed framework manages its + // resources. 
+ ncclComm_t nccl_comm_{nullptr}; +#endif + + mutable std::mutex blas_mtx_; + mutable std::mutex blas_tensor_core_mtx_; + mutable std::mutex blas_tf32_mtx_; + mutable std::mutex sparse_mtx_; + mutable std::mutex stream_call_back_mtx_; + mutable std::future last_future_; + + Allocator* allocator_{nullptr}; // external resource. + // A internal resouce to initinalize eigen_device. + std::unique_ptr eigen_stream_{nullptr}; +}; + +GPUContext::GPUContext() : DeviceContext(), impl_(std::make_unique()) {} + +GPUContext::GPUContext(const GPUPlace& place) + : DeviceContext(), impl_(std::make_unique(place)) {} + +GPUContext::~GPUContext() = default; + +const Place& GPUContext::GetPlace() const { return impl_->GetPlace(); } + +gpuStream_t GPUContext::stream() const { return impl_->GetStream(); } + +dnnHandle_t GPUContext::cudnn_handle() const { return impl_->GetDnnHandle(); } + +blasHandle_t GPUContext::cublas_handle() const { + return impl_->GetBlasHandle(); +} + +solverHandle_t GPUContext::cusolver_dn_handle() const { + return impl_->GetSolverHandle(); +} + +sparseHandle_t GPUContext::cusparse_handle() const { + return impl_->GetSparseHandle(); +} + +void GPUContext::Wait() const { impl_->Wait(); } + +void GPUContext::WaitEvent(gpuEvent_t ev) const { impl_->WaitEvent(ev); } + +bool GPUContext::tensor_core_available() const { + return impl_->IsTensorCoreAvailable(); +} + +int GPUContext::GetComputeCapability() const { + return impl_->compute_capability_; +} + +int GPUContext::GetMaxPhysicalThreadCount() const { + return impl_->multi_process_ * impl_->max_threads_per_mp_; +} + +int GPUContext::GetSMCount() const { return impl_->multi_process_; } + +int GPUContext::GetMaxThreadsPerBlock() const { + return impl_->max_threads_per_block_; +} + +std::array GPUContext::GetCUDAMaxGridDimSize() const { + return impl_->max_grid_dim_size_; +} + +Eigen::GpuDevice* GPUContext::eigen_device() const { + return impl_->eigen_device(); +} + +DnnWorkspaceHandle* GPUContext::cudnn_workspace_handle() { + return impl_->GetDnnWorkspace(); +} + +void GPUContext::CublasCall( + const std::function& callback) const { + impl_->CublasCall(callback); +} + +void GPUContext::TensorCoreCublasCallIfAvailable( + const std::function& callback) const { + impl_->TensorCoreCublasCallIfAvailable(callback); +} + +void GPUContext::CusparseCall( + const std::function& callback) const { + impl_->CusparseCall(callback); +} + +void GPUContext::RecordEvent(gpuEvent_t ev, + const std::function& callback) const { + impl_->RecordEvent(ev, callback); +} + +void GPUContext::RecordEvent(gpuEvent_t ev) const { impl_->RecordEvent(ev); } + +void GPUContext::AddStreamCallback( + const std::function& callback) const { + impl_->AddStreamCallback(callback); +} + +void GPUContext::WaitStreamCallback() const { impl_->WaitStreamCallback(); } + +ncclComm_t GPUContext::nccl_comm() const { return impl_->GetNcclComm(); } + +void GPUContext::set_nccl_comm(ncclComm_t comm) { impl_->SetNcclComm(comm); } + +void GPUContext::Init() { + impl_->allocator_ = const_cast(&this->GetAllocator()); + impl_->Init(); +} + +void GPUContext::SetStream(gpuStream_t stream) { impl_->SetStream(stream); } + +void GPUContext::SetEigenDevice(Eigen::GpuDevice* device) { + impl_->SetEigenDevice(device); +} + +void GPUContext::SetBlasHandle(blasHandle_t blas) { + impl_->SetBlasHandle(blas); +} + +void GPUContext::SetDnnHandle(dnnHandle_t handle) { + impl_->SetDnnHandle(handle); +} + +void GPUContext::SetSolverHandle(solverHandle_t handle) { + impl_->SetSolverHandle(handle); +} + +void 
GPUContext::SetSparseHandle(sparseHandle_t handle) { + impl_->SetSparseHandle(handle); +} + +void GPUContext::SetDnnWorkspaceHandle(DnnWorkspaceHandle* handle) { + impl_->workspace_ = handle; +} + +void GPUContext::PartialInitWithoutAllocator() { + impl_->PartialInitWithoutAllocator(); +} + +void GPUContext::PartialInitWithAllocator() { + impl_->allocator_ = const_cast(&this->GetAllocator()); + impl_->PartialInitWithAllocator(); +} + +void GPUContext::SetComputeCapability(int val) { + impl_->compute_capability_ = val; +} + +void GPUContext::SetMaxThreadsPerMultiProcessor(int val) { + impl_->max_threads_per_mp_ = val; +} + +void GPUContext::SetMultiProcessors(int val) { impl_->multi_process_ = val; } + +void GPUContext::SetMaxThreadsPerBlock(int val) { + impl_->max_threads_per_block_ = val; +} + +void GPUContext::SetMaxGridDimSize(const std::array& val) { + impl_->max_grid_dim_size_ = val; +} + +void GPUContext::SetDriverVersion(int val) { impl_->driver_version_ = val; } + +void GPUContext::SetRuntimeVersion(int val) { impl_->runtime_version_ = val; } + +} // namespace pten diff --git a/paddle/pten/backends/gpu/gpu_context.h b/paddle/pten/backends/gpu/gpu_context.h index 1adfd155ce334..2a2be0e44b4f0 100644 --- a/paddle/pten/backends/gpu/gpu_context.h +++ b/paddle/pten/backends/gpu/gpu_context.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,13 +14,162 @@ limitations under the License. */ #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - -// See Note [ Why still include the fluid headers? ] -#include "paddle/fluid/platform/device_context.h" +#include +#include +#include "paddle/pten/backends/gpu/forwards.h" +#include "paddle/pten/backends/gpu/gpu_decls.h" +#include "paddle/pten/backends/gpu/gpu_helper.h" +#include "paddle/pten/common/place.h" +#include "paddle/pten/core/device_context.h" namespace pten { -using GPUContext = paddle::platform::CUDADeviceContext; -} // namespace pten -#endif +class DnnWorkspaceHandle; + +class GPUContext : public DeviceContext { + public: + GPUContext(); + + explicit GPUContext(const GPUPlace& place); + + virtual ~GPUContext(); + + /*! \brief Return place in the device context. */ + const Place& GetPlace() const override; + + /*! \brief Return gpu stream in the device context. */ + gpuStream_t stream() const; + + /*! \brief Return cudnn handle in the device context. */ + dnnHandle_t cudnn_handle() const; + + /*! \brief Return cublas handle in the device context. */ + blasHandle_t cublas_handle() const; + + /*! \brief Return cusolver handle in the device context. */ + solverHandle_t cusolver_dn_handle() const; + + /*! \brief Return cusparse handle in the device context. */ + sparseHandle_t cusparse_handle() const; + + /*! \brief Wait for all operations completion in the stream. */ + void Wait() const override; + + /*! \brief Wait for event in the stream. */ + void WaitEvent(gpuEvent_t ev) const; + + /*! \brief Check whether tensor core is supported */ + bool tensor_core_available() const; + + /*! \brief Return compute capability in the device context. */ + int GetComputeCapability() const; + + /*! \brief Return the max physical thread count in the device context */ + int GetMaxPhysicalThreadCount() const; + + /*! \brief Return the SM count in the device context */ + int GetSMCount() const; + + /*! 
\brief Return the Max thread num of block in the device context */ + int GetMaxThreadsPerBlock() const; + + /*! \brief Return the max grid dim size in the device context */ + std::array GetCUDAMaxGridDimSize() const; + + /*! \brief Return eigen device in the device context. */ + Eigen::GpuDevice* eigen_device() const; + + /*! \brief Return a cudnn workspace handle to call multiple cudnn + * functions without interrupting by other threads. + * Once the first cudnn function is called by the handle, a lock + * would be acquired to prevent other threads from accessing the + * workspace. Once the handle is destructed, the lock would be released. + */ + DnnWorkspaceHandle* cudnn_workspace_handle(); + + public: + /*! \brief Call cublas function safely. */ + void CublasCall(const std::function&) const; + + /*! \brief Call cublas function with Tensor Core safely. If + Tensor Core is not available, use DEFAULT_MATH instead. */ + void TensorCoreCublasCallIfAvailable( + const std::function&) const; + + /*! \brief Call cusparse function safely. */ + void CusparseCall(const std::function&) const; + + void RecordEvent(gpuEvent_t ev, const std::function& callback) const; + + void RecordEvent(gpuEvent_t ev) const; + + void AddStreamCallback(const std::function& callback) const; + + void WaitStreamCallback() const; + + public: + /*! \brief Return nccl communicators. */ + ncclComm_t nccl_comm() const; + + /*! \brief Set nccl communicators. */ + void set_nccl_comm(ncclComm_t comm); + + public: + // NOTE: DeviceContext hold resources. Used in training scenarios. + // The interface used by the training scene, DeviceContext will initialize + // all resources and delete them when destructing. + // Note that you must set the Allocator before calling Init function. + void Init(); + + // TODO(wilber): Why does the GetAllocator interface require a stream + // parameter? + // The temporary trick method bypasses this problem, and the following + // interfaces + // need to be deleted later. + + // Note that this is a trick implementation, which can be used to partially + // initialize when the SetAllocator interface is not called. + void PartialInitWithoutAllocator(); + // Note that this is a trick implementation that can be used to initialize + // resources that require an Allocator when the SetAllocator interface is + // called. + void PartialInitWithAllocator(); + + protected: + // NOTE: External users manage resources. Used in inference scenarios. + // The Set interface is for inference only, DeviceContext will mark the + // resource as external, and will not delete any resource when destructing. + void SetStream(gpuStream_t); + + void SetEigenDevice(Eigen::GpuDevice*); + + void SetBlasHandle(blasHandle_t); + + void SetDnnHandle(dnnHandle_t); + + void SetSolverHandle(solverHandle_t); + + void SetSparseHandle(sparseHandle_t); + + void SetDnnWorkspaceHandle(DnnWorkspaceHandle*); + + void SetComputeCapability(int val); + + void SetMaxThreadsPerMultiProcessor(int val); + + void SetMultiProcessors(int val); + + void SetMaxThreadsPerBlock(int val); + + void SetMaxGridDimSize(const std::array& val); + + void SetDriverVersion(int val); + + void SetRuntimeVersion(int val); + + private: + struct Impl; + std::unique_ptr impl_; +}; + +} // namespace pten diff --git a/paddle/pten/backends/gpu/gpu_decls.h b/paddle/pten/backends/gpu/gpu_decls.h new file mode 100644 index 0000000000000..747244c4407e0 --- /dev/null +++ b/paddle/pten/backends/gpu/gpu_decls.h @@ -0,0 +1,70 @@ +// Copyright (c) 2022 PaddlePaddle Authors. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/pten/backends/gpu/forwards.h" + +namespace pten { + +#ifdef PADDLE_WITH_HIP +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ + using GPU_TYPE = ROCM_TYPE; + +#else // PADDLE_WITH_CDUA + +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ + using GPU_TYPE = CUDA_TYPE; +#endif + +DECLARE_TYPE_FOR_GPU(gpuStream_t, cudaStream_t, hipStream_t); +DECLARE_TYPE_FOR_GPU(gpuEvent_t, cudaEvent_t, hipEvent_t); + +DECLARE_TYPE_FOR_GPU(dnnActivationDescriptor, + cudnnActivationStruct, + miopenActivationDescriptor); +DECLARE_TYPE_FOR_GPU(dnnTensorDescriptor, + cudnnTensorStruct, + miopenTensorDescriptor); +DECLARE_TYPE_FOR_GPU(dnnFilterDescriptor, + cudnnFilterStruct, + miopenTensorDescriptor); +DECLARE_TYPE_FOR_GPU(dnnFilterDescriptor_t, + cudnnFilterDescriptor_t, + miopenTensorDescriptor_t); +DECLARE_TYPE_FOR_GPU(dnnConvolutionDescriptor, + cudnnConvolutionStruct, + miopenConvolutionDescriptor); +DECLARE_TYPE_FOR_GPU(dnnConvolutionDescriptor_t, + cudnnConvolutionDescriptor_t, + miopenConvolutionDescriptor_t); +DECLARE_TYPE_FOR_GPU(dnnPoolingDescriptor_t, + cudnnPoolingDescriptor_t, + miopenPoolingDescriptor_t); +DECLARE_TYPE_FOR_GPU(dnnDropoutDescriptor_t, + cudnnDropoutDescriptor_t, + miopenDropoutDescriptor_t); +DECLARE_TYPE_FOR_GPU(dnnHandle_t, cudnnHandle_t, miopenHandle_t); + +DECLARE_TYPE_FOR_GPU(blasHandle_t, cublasHandle_t, rocblas_handle); + +DECLARE_TYPE_FOR_GPU(solverHandle_t, cusolverDnHandle_t, rocsolver_handle); + +DECLARE_TYPE_FOR_GPU(sparseHandle_t, cusparseHandle_t, rocsparse_handle); + +#undef DECLARE_TYPE_FOR_GPU + +using CUDAGraphID = unsigned long long; // NOLINT + +} // namespace pten diff --git a/paddle/pten/backends/gpu/gpu_helper.h b/paddle/pten/backends/gpu/gpu_helper.h new file mode 100644 index 0000000000000..e9254115ed5bc --- /dev/null +++ b/paddle/pten/backends/gpu/gpu_helper.h @@ -0,0 +1,26 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
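
The DECLARE_TYPE_FOR_GPU aliases above, together with the opaque handles forward-declared in forwards.h, let backend-neutral headers name GPU handle types without pulling in any CUDA or ROCm headers. A minimal sketch of the intended consumption pattern follows; the ExampleHolder struct and its member names are illustrative assumptions, not part of this patch.

    // Sketch only: a header-side consumer of the neutral handle aliases.
    // Depending on whether PADDLE_WITH_HIP is defined, pten::gpuStream_t,
    // pten::blasHandle_t and pten::dnnHandle_t resolve to the CUDA types
    // (cudaStream_t, cublasHandle_t, cudnnHandle_t) or the ROCm ones
    // (hipStream_t, rocblas_handle, miopenHandle_t), so this struct compiles
    // against either stack without including cublas_v2.h or rocblas.h.
    #include "paddle/pten/backends/gpu/gpu_decls.h"

    struct ExampleHolder {  // hypothetical, for illustration
      pten::gpuStream_t stream{nullptr};
      pten::blasHandle_t blas{nullptr};
      pten::dnnHandle_t dnn{nullptr};
    };
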
+ +#pragma once +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + +#ifdef PADDLE_WITH_HIP +#include "paddle/pten/backends/gpu/rocm/rocm_helper.h" +#else +#include "paddle/pten/backends/gpu/cuda/cuda_helper.h" +#endif + +#define CUDA_KERNEL_LOOP(i, num) CUDA_KERNEL_LOOP_TYPE(i, num, int) + +#endif diff --git a/paddle/pten/backends/gpu/gpu_info.cc b/paddle/pten/backends/gpu/gpu_info.cc new file mode 100644 index 0000000000000..d1b8d4a362637 --- /dev/null +++ b/paddle/pten/backends/gpu/gpu_info.cc @@ -0,0 +1,60 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/pten/backends/gpu/gpu_info.h" + +#include + +#include "gflags/gflags.h" + +DECLARE_string(selected_gpus); + +namespace pten { +namespace backends { +namespace gpu { + +static inline std::vector Split(std::string const& original, + char separator) { + std::vector results; + std::string token; + std::istringstream is(original); + while (std::getline(is, token, separator)) { + if (!token.empty()) { + results.push_back(token); + } + } + return results; +} + +//! Get a list of device ids from environment variable or use all. +std::vector GetSelectedDevices() { + // use user specified GPUs in single-node multi-process mode. + std::vector devices; + if (!FLAGS_selected_gpus.empty()) { + auto devices_str = Split(FLAGS_selected_gpus, ','); + for (auto id : devices_str) { + devices.push_back(atoi(id.c_str())); + } + } else { + int count = GetGPUDeviceCount(); + for (int i = 0; i < count; ++i) { + devices.push_back(i); + } + } + return devices; +} + +} // namespace gpu +} // namespace backends +} // namespace pten diff --git a/paddle/pten/backends/gpu/gpu_info.h b/paddle/pten/backends/gpu/gpu_info.h new file mode 100644 index 0000000000000..59add6166d1d2 --- /dev/null +++ b/paddle/pten/backends/gpu/gpu_info.h @@ -0,0 +1,132 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + +#include +#include +#include +#include + +#include "paddle/pten/backends/gpu/gpu_types.h" + +namespace pten { +namespace backends { +namespace gpu { + +//! Get the version of dnn +int DnnVersion(); + +//! Get the total number of GPU devices in system. +int GetGPUDeviceCount(); + +//! Get the compute capability of the ith GPU (format: major * 10 + minor) +int GetGPUComputeCapability(int id); + +//! Get the runtime version of the ith GPU +int GetGPURuntimeVersion(int id); + +//! 
Get the driver version of the ith GPU +int GetGPUDriverVersion(int id); + +//! Wheter the current device support TensorCore +bool TensorCoreAvailable(); + +//! Get the MultiProcessors of the ith GPU. +int GetGPUMultiProcessors(int id); + +//! Get the MaxThreads of each MultiProcessor of the ith GPU. +int GetGPUMaxThreadsPerMultiProcessor(int id); + +//! Get the MaxThreads of each block of the ith GPU. +int GetGPUMaxThreadsPerBlock(int id); + +//! Get the current GPU device id in system. +int GetCurrentDeviceId(); + +//! Get the maximum GridDim size for GPU buddy allocator. +std::array GetGpuMaxGridDimSize(int); + +//! Get a list of device ids from environment variable or use all. +std::vector GetSelectedDevices(); + +//! Get the properties of the ith GPU device. +const gpuDeviceProp &GetDeviceProperties(int id); + +//! Set the GPU device id for next execution. +void SetDeviceId(int device_id); + +//! Copy memory from address src to dst asynchronously. +void GpuMemcpyAsync(void *dst, + const void *src, + size_t count, + gpuMemcpyKind kind, + gpuStream_t stream); + +//! Copy memory from address src to dst synchronously. +void GpuMemcpySync(void *dst, + const void *src, + size_t count, + gpuMemcpyKind kind); + +//! Copy memory from one device to another device asynchronously. +void GpuMemcpyPeerAsync(void *dst, + int dst_device, + const void *src, + int src_device, + size_t count, + gpuStream_t stream); + +//! Copy memory from one device to another device synchronously. +void GpuMemcpyPeerSync( + void *dst, int dst_device, const void *src, int src_device, size_t count); + +//! Set memory dst with value count size asynchronously +void GpuMemsetAsync(void *dst, int value, size_t count, gpuStream_t stream); + +//! Blocks until stream has completed all operations. +void GpuStreamSync(gpuStream_t stream); + +void GpuDestroyStream(gpuStream_t stream); + +// ! Blocks until device has completed all operations. +void GpuDeviceSync(); + +gpuError_t GpuGetLastError(); + +class GPUDeviceGuard { + public: + explicit inline GPUDeviceGuard(int dev_id) { + int prev_id = GetCurrentDeviceId(); + if (prev_id != dev_id) { + prev_id_ = prev_id; + SetDeviceId(dev_id); + } + } + inline ~GPUDeviceGuard() { + if (prev_id_ != -1) { + SetDeviceId(prev_id_); + } + } + GPUDeviceGuard(const GPUDeviceGuard &o) = delete; + GPUDeviceGuard &operator=(const GPUDeviceGuard &o) = delete; + + private: + int prev_id_{-1}; +}; + +} // namespace gpu +} // namespace backends +} // namespace pten + +#endif diff --git a/paddle/pten/backends/gpu/gpu_launch_config.h b/paddle/pten/backends/gpu/gpu_launch_config.h new file mode 100644 index 0000000000000..c166b3d2f80ed --- /dev/null +++ b/paddle/pten/backends/gpu/gpu_launch_config.h @@ -0,0 +1,181 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
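
The wrappers declared in gpu_info.h mirror the fluid platform helpers that this patch moves under paddle/pten/backends. Below is a minimal usage sketch, assuming the caller already owns a stream and supplies the memcpy kind; the CopyAndSync function name is illustrative only, and the pten::gpuMemcpyKind spelling assumes the alias defined in gpu_types.h.

    #include <cstddef>
    #include "paddle/pten/backends/gpu/gpu_info.h"
    #include "paddle/pten/backends/gpu/gpu_types.h"

    // Sketch: copy `bytes` from src to dst while device `dev_id` is active,
    // then block until the copy has finished. GPUDeviceGuard restores the
    // previously active device when it goes out of scope.
    void CopyAndSync(int dev_id, void* dst, const void* src, size_t bytes,
                     pten::gpuMemcpyKind kind, pten::gpuStream_t stream) {
      pten::backends::gpu::GPUDeviceGuard guard(dev_id);
      pten::backends::gpu::GpuMemcpyAsync(dst, src, bytes, kind, stream);
      pten::backends::gpu::GpuStreamSync(stream);
    }
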
+ +// Used for compute gpu launch parameter config + +#pragma once + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + +#ifdef PADDLE_WITH_CUDA +#include +#else +#include +#endif + +#include +#include +#include +#include +#include "paddle/pten/backends/gpu/gpu_context.h" + +#ifdef __HIPCC__ +// HIP results in error or nan if > 256 +#define PREDEFINED_BLOCK_SIZE 256 +#else +/* CUDA performs better as thread_per_block + num is between [64, 512] */ +#define PREDEFINED_BLOCK_SIZE 512 +#endif + +namespace pten { +namespace backends { +namespace gpu { + +inline int DivUp(int a, int b) { return (a + b - 1) / b; } + +/* https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 + for round integer value into next highest power of 2. */ +static inline int RoundToPowerOfTwo(int n) { + n--; + n |= (n >> 1); + n |= (n >> 2); + n |= (n >> 4); + n |= (n >> 8); + n |= (n >> 16); +#ifdef __HIPCC__ + return std::min(256, std::max(32, (n + 1))); +#else + return std::min(1024, std::max(32, (n + 1))); +#endif +} + +#ifdef WITH_NV_JETSON +// The number of threads cannot be assigned 1024 in some cases when the device +// is nano or tx2 . +inline void ChangeThreadNum(const pten::CUDAContext& context, + int* num_thread, + int alternative_num_thread = 512) { + if (context.GetComputeCapability() == 53 || + context.GetComputeCapability() == 62) { + *num_thread = alternative_num_thread; + } +} +#endif + +struct GpuLaunchConfig { + public: + GpuLaunchConfig() {} + + size_t GetThreadNum() const { return GetBlockSize() * GetGridSize(); } + + size_t GetGridSize() const { + return block_per_grid.x * block_per_grid.y * block_per_grid.z; + } + + size_t GetBlockSize() const { + return thread_per_block.x * thread_per_block.y * thread_per_block.z; + } + + int compute_capability = 0; + dim3 thread_per_block = dim3(1, 1, 1); + dim3 block_per_grid = dim3(1, 1, 1); +}; + +/* According to NVIDIA, if number of threads per block is 64/128/256/512, + * cuda performs better. And number of blocks should be greater (at least + * 2x~4x) than number of SMs. Hence, SM count is took into account within + * this function to determine the right number of threads per block. */ +inline GpuLaunchConfig GetGpuLaunchConfig1D(const pten::GPUContext& context, + int64_t numel, + int vec_size = 1) { + PADDLE_ENFORCE_GT(numel, + 0, + paddle::platform::errors::InvalidArgument( + "element quantity should be greater than 0," + " but received value is: %d.", + numel)); + // Get compute_capability + const int capability = context.GetComputeCapability(); + /* If thread number per block is 64/128/256/512, cuda performs better.*/ + int limit_threads = + std::min(PREDEFINED_BLOCK_SIZE, context.GetMaxThreadsPerBlock()); +#ifdef WITH_NV_JETSON + if (capability == 53 || capability == 62) { + limit_threads = 512; + } +#endif + int threads = limit_threads; + int sm_count = context.GetSMCount(); + int active_threads_num = numel / vec_size; + if (active_threads_num / (sm_count << 1) < limit_threads) { + // Round up threads number into an exponential multiple of 2, while number + // of acitve blocks is about twice of SM, to acquire better performance. + threads = RoundToPowerOfTwo(active_threads_num / (sm_count << 1)); + } else if (active_threads_num / (sm_count << 2) < limit_threads) { + // Round up threads number into an exponential multiple of 2, while number + // of acitve blocks is about 4 times of SM, to acquire better performance. 
+ threads = RoundToPowerOfTwo(active_threads_num / (sm_count << 2)); + } + // Number of threads per block shall be larger than 64. + threads = std::max(64, threads); + int blocks = DivUp(DivUp(numel, vec_size), threads); + + GpuLaunchConfig config; + config.thread_per_block.x = threads; + config.block_per_grid.x = blocks; + config.compute_capability = capability; + return config; +} + +inline GpuLaunchConfig GetGpuLaunchConfig2D(const pten::GPUContext& context, + int x_dim, + int y_dim) { + PADDLE_ENFORCE_GT(x_dim, + 0, + paddle::platform::errors::InvalidArgument( + "x dim number should greater than 0," + " but received value is: %d", + x_dim)); + PADDLE_ENFORCE_GT(y_dim, + 0, + paddle::platform::errors::InvalidArgument( + "y dim number should greater than 0," + " but received value is: %d", + y_dim)); + + const int kThreadsPerBlock = 256; + int block_cols = (std::min)(x_dim, kThreadsPerBlock); + int block_rows = (std::max)(kThreadsPerBlock / block_cols, 1); + + int max_physical_threads = context.GetMaxPhysicalThreadCount(); + const int max_blocks = (std::max)(max_physical_threads / kThreadsPerBlock, 1); + + GpuLaunchConfig config; + // Noticed, block size is not align to 32, if needed do it yourself. + config.thread_per_block = dim3(block_cols, block_rows, 1); + + int grid_x = (std::min)(DivUp(x_dim, block_cols), max_blocks); + int grid_y = + (std::min)(max_blocks / grid_x, (std::max)(y_dim / block_rows, 1)); + + config.block_per_grid = dim3(grid_x, grid_y, 1); + return config; +} + +} // namespace gpu +} // namespace backends +} // namespace pten + +#endif diff --git a/paddle/pten/backends/gpu/gpu_types.h b/paddle/pten/backends/gpu/gpu_types.h new file mode 100644 index 0000000000000..72353294b29e0 --- /dev/null +++ b/paddle/pten/backends/gpu/gpu_types.h @@ -0,0 +1,73 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
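To see what the 1-D heuristic above actually produces, the following host-only sketch reproduces its arithmetic for an assumed device with 80 SMs and a 512-thread block limit (the CUDA branch of RoundToPowerOfTwo). The device numbers and element count are illustrative assumptions, not values from the patch:

#include <algorithm>
#include <cstdint>
#include <cstdio>

static int DivUp(int a, int b) { return (a + b - 1) / b; }

static int RoundToPowerOfTwo(int n) {
  n--;
  n |= (n >> 1);
  n |= (n >> 2);
  n |= (n >> 4);
  n |= (n >> 8);
  n |= (n >> 16);
  return std::min(1024, std::max(32, n + 1));  // CUDA branch; the HIP branch caps at 256
}

int main() {
  int64_t numel = 10000;   // assumed element count
  int vec_size = 4;        // assumed vectorization width
  int sm_count = 80;       // assumed SM count
  int limit_threads = 512; // PREDEFINED_BLOCK_SIZE on CUDA
  int active_threads = static_cast<int>(numel / vec_size);  // 2500
  int threads = limit_threads;
  if (active_threads / (sm_count << 1) < limit_threads) {
    // aim for roughly two active blocks per SM
    threads = RoundToPowerOfTwo(active_threads / (sm_count << 1));
  } else if (active_threads / (sm_count << 2) < limit_threads) {
    // otherwise roughly four active blocks per SM
    threads = RoundToPowerOfTwo(active_threads / (sm_count << 2));
  }
  threads = std::max(64, threads);  // never fewer than 64 threads per block
  int blocks = DivUp(DivUp(static_cast<int>(numel), vec_size), threads);
  std::printf("threads=%d blocks=%d\n", threads, blocks);  // prints: threads=64 blocks=40
  return 0;
}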
+ +#pragma once + +#include "paddle/pten/backends/gpu/forwards.h" +#include "paddle/pten/backends/gpu/gpu_decls.h" + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + +#ifdef PADDLE_WITH_HIP +#include "paddle/pten/backends/dynload/miopen.h" +#include "paddle/pten/backends/dynload/rocblas.h" +#else // PADDLE_WITH_CUDA +#include "paddle/pten/backends/dynload/cublas.h" +#include "paddle/pten/backends/dynload/cudnn.h" +#endif + +namespace pten { + +#ifdef PADDLE_WITH_HIP +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ + using GPU_TYPE = ROCM_TYPE; + +#else // PADDLE_WITH_CDUA + +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ + using GPU_TYPE = CUDA_TYPE; +#endif + +DECLARE_TYPE_FOR_GPU(gpuError_t, cudaError_t, hipError_t); +DECLARE_TYPE_FOR_GPU(gpuMemcpyKind, cudaMemcpyKind, hipMemcpyKind); +DECLARE_TYPE_FOR_GPU(gpuDeviceProp, cudaDeviceProp, hipDeviceProp_t); +DECLARE_TYPE_FOR_GPU(dnnDataType_t, cudnnDataType_t, miopenDataType_t); +DECLARE_TYPE_FOR_GPU(dnnPoolingMode_t, cudnnPoolingMode_t, miopenPoolingMode_t); +DECLARE_TYPE_FOR_GPU(dnnTensorFormat_t, + cudnnTensorFormat_t, + miopenTensorFormat_t); +DECLARE_TYPE_FOR_GPU(dnnActivationMode_t, + cudnnActivationMode_t, + miopenActivationMode_t); + +#undef DECLARE_TYPE_FOR_GPU + +#ifdef PADDLE_WITH_HIP +#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV) \ + constexpr auto GPU_CV = ROCM_CV; +#else // PADDLE_WITH_CUDA +#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV) \ + constexpr auto GPU_CV = CUDA_CV; +#endif + +DECLARE_CONSTANT_FOR_GPU(gpuErrorOutOfMemory, + cudaErrorMemoryAllocation, + hipErrorOutOfMemory); +DECLARE_CONSTANT_FOR_GPU(gpuErrorNotReady, cudaErrorNotReady, hipErrorNotReady); +DECLARE_CONSTANT_FOR_GPU(gpuSuccess, cudaSuccess, hipSuccess); + +#undef DECLARE_CONSTANT_FOR_GPU +} // namespace pten + +#endif // defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) diff --git a/paddle/pten/backends/gpu/rocm/CMakeLists.txt b/paddle/pten/backends/gpu/rocm/CMakeLists.txt new file mode 100644 index 0000000000000..181f92cbfc31c --- /dev/null +++ b/paddle/pten/backends/gpu/rocm/CMakeLists.txt @@ -0,0 +1 @@ +hip_library(pten_rocm_info SRCS rocm_info.cc DEPS gflags glog enforce pten_dynload_cuda) diff --git a/paddle/pten/backends/gpu/rocm/rocm_helper.h b/paddle/pten/backends/gpu/rocm/rocm_helper.h new file mode 100644 index 0000000000000..3a16bfc5286ae --- /dev/null +++ b/paddle/pten/backends/gpu/rocm/rocm_helper.h @@ -0,0 +1,72 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +namespace pten { +namespace backends { +namespace gpu { + +/* + * Summary: Grid stride looping macro in CUDA kernel + * + * [ Why need this macro? ] + * + * The original looping in CUDA kernel is: + * + * `for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + * i += blockDim.x * gridDim.x)` + * + * This for condition is risky. 
The value of `blockIdx.x * blockDim.x` + * may be large, such as over 1GB; the first iteration is no problem here, + * but when `i += blockDim.x * gridDim.x` is executed, the value of i + * will exceed INT_MAX and overflow to a negative value. At that point + * the loop condition `i < (n)` is still satisfied, so the kernel makes + * an illegal access to CUDA memory. + * + * Here is a real example from ERNIE that triggers the above error. + * The related data are: + * - blockIdx.x = 2172938 + * - blockDim.x = 512 + * - blockIdx.x * blockDim.x = 1112543864 + * - INT_MAX = 2147483647 + * + * So we rewrite the for condition as follows; the int64_t __index__ + * prevents overflow in the loop increment. + * + * Parameters: + * - i: loop index + * - num: total number of elements + * + * Examples: + * template <typename T> + * __global__ void Scale(T* logit_grad, const T* loss_grad, const int num, + * const int d, const int remain) { + * CUDA_KERNEL_LOOP(index, num) { + * int idx_n = index / d; + * int idx_remain = index % remain; + * logit_grad[index] *= loss_grad[idx_n * remain + idx_remain]; + * } + * } + * +*/ + +#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \ + int64_t __index__ = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; \ + for (index_type i = __index__; __index__ < (num); \ + __index__ += hipBlockDim_x * hipGridDim_x, i = __index__) + +} // namespace gpu +} // namespace backends +} // namespace pten diff --git a/paddle/fluid/platform/device/gpu/rocm/rocm_info.cc b/paddle/pten/backends/gpu/rocm/rocm_info.cc similarity index 72% rename from paddle/fluid/platform/device/gpu/rocm/rocm_info.cc rename to paddle/pten/backends/gpu/rocm/rocm_info.cc index 06dba8ce423ef..095acdf0769d4 100644 --- a/paddle/fluid/platform/device/gpu/rocm/rocm_info.cc +++ b/paddle/pten/backends/gpu/rocm/rocm_info.cc @@ -12,20 +12,20 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include +#include "paddle/pten/backends/gpu/gpu_info.h" + +// TODO(pten): remove fluid headers. #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/lock_guard_ptr.h" -#include "paddle/fluid/platform/macros.h" -#include "paddle/fluid/platform/monitor.h" -#include "paddle/fluid/platform/place.h" -#include "paddle/fluid/string/split.h" static std::once_flag g_device_props_size_init_flag; static std::vector> g_device_props_init_flags; static std::vector g_device_props; -namespace paddle { -namespace platform { +namespace pten { +namespace backends { +namespace gpu { + int DnnVersion() { if (!dynload::HasCUDNN()) return -1; size_t version_major, version_minor, version_patch; @@ -78,11 +78,13 @@ int GetGPUDeviceCount() { } int GetGPUComputeCapability(int id) { - PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), - platform::errors::InvalidArgument( + PADDLE_ENFORCE_LT(id, + GetGPUDeviceCount(), + paddle::platform::errors::InvalidArgument( "Device id must be less than GPU count, " "but received id is: %d. 
GPU count is: %d.", - id, GetGPUDeviceCount())); + id, + GetGPUDeviceCount())); int major, minor; auto major_error_code = hipDeviceGetAttribute( &major, hipDeviceAttributeComputeCapabilityMajor, id); @@ -95,22 +97,26 @@ int GetGPUComputeCapability(int id) { } int GetGPURuntimeVersion(int id) { - PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), - platform::errors::InvalidArgument( + PADDLE_ENFORCE_LT(id, + GetGPUDeviceCount(), + paddle::platform::errors::InvalidArgument( "Device id must be less than GPU count, " "but received id is: %d. GPU count is: %d.", - id, GetGPUDeviceCount())); + id, + GetGPUDeviceCount())); int runtime_version = 0; PADDLE_ENFORCE_GPU_SUCCESS(hipRuntimeGetVersion(&runtime_version)); return runtime_version; } int GetGPUDriverVersion(int id) { - PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), - platform::errors::InvalidArgument( + PADDLE_ENFORCE_LT(id, + GetGPUDeviceCount(), + paddle::platform::errors::InvalidArgument( "Device id must be less than GPU count, " "but received id is: %d. GPU count is: %d.", - id, GetGPUDeviceCount())); + id, + GetGPUDeviceCount())); int driver_version = 0; PADDLE_ENFORCE_GPU_SUCCESS(hipDriverGetVersion(&driver_version)); return driver_version; @@ -119,11 +125,13 @@ int GetGPUDriverVersion(int id) { bool TensorCoreAvailable() { return false; } int GetGPUMultiProcessors(int id) { - PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), - platform::errors::InvalidArgument( + PADDLE_ENFORCE_LT(id, + GetGPUDeviceCount(), + paddle::platform::errors::InvalidArgument( "Device id must be less than GPU count, " "but received id is: %d. GPU count is: %d.", - id, GetGPUDeviceCount())); + id, + GetGPUDeviceCount())); int count; PADDLE_ENFORCE_GPU_SUCCESS( hipDeviceGetAttribute(&count, hipDeviceAttributeMultiprocessorCount, id)); @@ -131,11 +139,13 @@ int GetGPUMultiProcessors(int id) { } int GetGPUMaxThreadsPerMultiProcessor(int id) { - PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), - platform::errors::InvalidArgument( + PADDLE_ENFORCE_LT(id, + GetGPUDeviceCount(), + paddle::platform::errors::InvalidArgument( "Device id must be less than GPU count, " "but received id is: %d. GPU count is: %d.", - id, GetGPUDeviceCount())); + id, + GetGPUDeviceCount())); int count; PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceGetAttribute( &count, hipDeviceAttributeMaxThreadsPerMultiProcessor, id)); @@ -144,11 +154,13 @@ int GetGPUMaxThreadsPerMultiProcessor(int id) { } int GetGPUMaxThreadsPerBlock(int id) { - PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), - platform::errors::InvalidArgument( + PADDLE_ENFORCE_LT(id, + GetGPUDeviceCount(), + paddle::platform::errors::InvalidArgument( "Device id must be less than GPU count, " "but received id is: %d. GPU count is: %d.", - id, GetGPUDeviceCount())); + id, + GetGPUDeviceCount())); int count; PADDLE_ENFORCE_GPU_SUCCESS( hipDeviceGetAttribute(&count, hipDeviceAttributeMaxThreadsPerBlock, id)); @@ -161,35 +173,37 @@ int GetCurrentDeviceId() { return device_id; } -dim3 GetGpuMaxGridDimSize(int id) { - PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), - platform::errors::InvalidArgument( +std::array GetGpuMaxGridDimSize(int id) { + PADDLE_ENFORCE_LT(id, + GetGPUDeviceCount(), + paddle::platform::errors::InvalidArgument( "Device id must be less than GPU count, " "but received id is: %d. 
GPU count is: %d.", - id, GetGPUDeviceCount())); - dim3 ret; + id, + GetGPUDeviceCount())); + std::array ret; int size; auto error_code_x = hipDeviceGetAttribute(&size, hipDeviceAttributeMaxGridDimX, id); PADDLE_ENFORCE_GPU_SUCCESS(error_code_x); - ret.x = size; + ret[0] = size; auto error_code_y = hipDeviceGetAttribute(&size, hipDeviceAttributeMaxGridDimY, id); PADDLE_ENFORCE_GPU_SUCCESS(error_code_y); - ret.y = size; + ret[1] = size; auto error_code_z = hipDeviceGetAttribute(&size, hipDeviceAttributeMaxGridDimZ, id); PADDLE_ENFORCE_GPU_SUCCESS(error_code_z); - ret.z = size; + ret[2] = size; return ret; } const gpuDeviceProp &GetDeviceProperties(int id) { std::call_once(g_device_props_size_init_flag, [&] { int gpu_num = 0; - gpu_num = platform::GetGPUDeviceCount(); + gpu_num = GetGPUDeviceCount(); g_device_props_init_flags.resize(gpu_num); g_device_props.resize(gpu_num); for (int i = 0; i < gpu_num; ++i) { @@ -198,16 +212,17 @@ const gpuDeviceProp &GetDeviceProperties(int id) { }); if (id == -1) { - id = platform::GetCurrentDeviceId(); + id = GetCurrentDeviceId(); } if (id < 0 || id >= static_cast(g_device_props.size())) { - PADDLE_THROW(platform::errors::OutOfRange( + PADDLE_THROW(paddle::platform::errors::OutOfRange( "The device id %d is out of range [0, %d), where %d is the number of " "devices on this machine. Because the device id should be greater than " "or equal to zero and smaller than the number of gpus. Please input " "appropriate device again!", - id, static_cast(g_device_props.size()), + id, + static_cast(g_device_props.size()), static_cast(g_device_props.size()))); } @@ -220,32 +235,43 @@ const gpuDeviceProp &GetDeviceProperties(int id) { void SetDeviceId(int id) { // TODO(qijun): find a better way to cache the cuda device count - PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), - platform::errors::InvalidArgument( + PADDLE_ENFORCE_LT(id, + GetGPUDeviceCount(), + paddle::platform::errors::InvalidArgument( "Device id must be less than GPU count, " "but received id is: %d. 
GPU count is: %d.", - id, GetGPUDeviceCount())); + id, + GetGPUDeviceCount())); PADDLE_RETRY_CUDA_SUCCESS(hipSetDevice(id)); } -void GpuMemcpyAsync(void *dst, const void *src, size_t count, - gpuMemcpyKind kind, gpuStream_t stream) { +void GpuMemcpyAsync(void *dst, + const void *src, + size_t count, + gpuMemcpyKind kind, + gpuStream_t stream) { PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpyAsync(dst, src, count, kind, stream)); } -void GpuMemcpySync(void *dst, const void *src, size_t count, +void GpuMemcpySync(void *dst, + const void *src, + size_t count, gpuMemcpyKind kind) { PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpy(dst, src, count, kind)); } -void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src, - int src_device, size_t count, gpuStream_t stream) { +void GpuMemcpyPeerAsync(void *dst, + int dst_device, + const void *src, + int src_device, + size_t count, + gpuStream_t stream) { PADDLE_ENFORCE_GPU_SUCCESS( hipMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream)); } -void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src, - int src_device, size_t count) { +void GpuMemcpyPeerSync( + void *dst, int dst_device, const void *src, int src_device, size_t count) { PADDLE_ENFORCE_GPU_SUCCESS( hipMemcpyPeer(dst, dst_device, src, src_device, count)); } @@ -265,5 +291,7 @@ void GpuDestroyStream(gpuStream_t stream) { void GpuDeviceSync() { PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); } gpuError_t GpuGetLastError() { return hipGetLastError(); } -} // namespace platform -} // namespace paddle + +} // namespace gpu +} // namespace backends +} // namespace pten diff --git a/paddle/pten/backends/xpu/xpu_context.cc b/paddle/pten/backends/xpu/xpu_context.cc index af4478662a53b..22e82b220d2a8 100644 --- a/paddle/pten/backends/xpu/xpu_context.cc +++ b/paddle/pten/backends/xpu/xpu_context.cc @@ -13,8 +13,11 @@ // limitations under the License. #include "paddle/pten/backends/xpu/xpu_context.h" + #include + #include "paddle/pten/api/ext/exception.h" +#include "paddle/pten/common/place.h" #include "xpu/runtime.h" #include "xpu/runtime_ex.h" @@ -24,12 +27,11 @@ namespace xpu = baidu::xpu::api; namespace pten { -struct XPUContext::XPUImpl { - void SetL3Cache() { +struct XPUContext::Impl { + void SetL3Cache(int l3_size = 14155776) { const int MAX_XPU_NUM = 16; static void* l3ptrs[MAX_XPU_NUM] = {nullptr}; - int l3_size = 13.5 * 1024 * 1024; if (std::getenv("XPU_PADDLE_L3_SIZE") != nullptr) { l3_size = atoi(std::getenv("XPU_PADDLE_L3_SIZE")); } @@ -52,48 +54,28 @@ struct XPUContext::XPUImpl { } } - XPUImpl() { - context_ = xpu::create_context(); - xpu_version_ = backends::xpu::get_xpu_version(place_.device); - } - - explicit XPUImpl(XPUPlace place) : place_(place) { - backends::xpu::XPUDeviceGuard guard(place_.GetDeviceId()); - - LOG_FIRST_N(WARNING, 1) << "Please NOTE: xpu device: " - << static_cast(place_.device); - - context_ = xpu::create_context(); - xpu_version_ = backends::xpu::get_xpu_version(place_.device); - SetL3Cache(); - } + Impl() : place_(XPUPlace()) {} - // Users need to manage external resources. 
- explicit XPUImpl(const XPUContextResource& ctx_res, - const XPUPlace& place = XPUPlace(0)) - : res_(ctx_res), place_(place) { - context_ = res_.context; - xpu_version_ = backends::xpu::get_xpu_version(place_.device); - SetL3Cache(); - } + explicit Impl(const Place& place) : place_(place) {} - ~XPUImpl() { - if (res_.context == nullptr && context_ != nullptr) { + ~Impl() { + if (owned_ && context_ != nullptr) { xpu::destroy_context(context_); context_ = nullptr; } } - Place GetPlace() const { return place_; } - - backends::xpu::XPUVersion GetXpuVersion() const { return xpu_version_; } + const Place& GetPlace() const { return place_; } xpu::Context* GetXContext() const { PD_CHECK(context_ != nullptr, "the xpu context is nullptr."); return context_; } - xpu::BKCLContext_t GetBkclContext() const { return bkcl_context_; } + xpu::BKCLContext_t GetBkclContext() const { + PD_CHECK(bkcl_context_ != nullptr, "the xpu bkcl_context is nullptr."); + return bkcl_context_; + } void Wait() const { backends::xpu::SetXPUDeviceId(place_.GetDeviceId()); @@ -101,53 +83,41 @@ struct XPUContext::XPUImpl { xpu_wait(context_->xpu_stream); } - void SetXContext(xpu::Context* context) { - if (context == nullptr) { - return; - } - res_.context = context; - context_ = context; + void Init() { + owned_ = true; + backends::xpu::XPUDeviceGuard guard(place_.GetDeviceId()); + LOG_FIRST_N(WARNING, 1) << "Please NOTE: xpu device: " + << static_cast(place_.device); + context_ = xpu::create_context(); + xpu_version_ = backends::xpu::get_xpu_version(place_.device); + SetL3Cache(); } + void SetXContext(xpu::Context* context) { context_ = context; } + void SetBkclContext(xpu::BKCLContext_t context) { bkcl_context_ = context; } - XPUContextResource res_; - XPUPlace place_; + bool owned_{false}; + Place place_; backends::xpu::XPUVersion xpu_version_; xpu::Context* context_{nullptr}; + // NOTE: Distributed communicator, distributed framework manages its // resources, XPUContext only holds references. 
xpu::BKCLContext_t bkcl_context_{nullptr}; }; -XPUContext::XPUContext() : DeviceContext() { - impl_ = std::make_unique(); -} +XPUContext::XPUContext() : DeviceContext(), impl_(std::make_unique()) {} -XPUContext::XPUContext(const XPUPlace& place) { - impl_ = std::make_unique(place); -} - -XPUContext::XPUContext(const XPUContext& other) : DeviceContext() { - impl_ = std::make_unique(); - impl_->SetXContext(other.x_context()); - impl_->SetBkclContext(other.bkcl_context()); -} - -XPUContext::XPUContext(XPUContext&& other) : DeviceContext() { - impl_ = std::move(other.impl_); -} +XPUContext::XPUContext(const XPUPlace& place) + : DeviceContext(), impl_(std::make_unique(place)) {} XPUContext::~XPUContext() = default; -XPUContext::XPUContext(const XPUContextResource& ctx_res) : DeviceContext() { - impl_ = std::make_unique(ctx_res); -} - -Place XPUContext::GetPlace() const { return impl_->GetPlace(); } +const Place& XPUContext::GetPlace() const { return impl_->GetPlace(); } backends::xpu::XPUVersion XPUContext::xpu_version() const { - return impl_->GetXpuVersion(); + return impl_->xpu_version_; } xpu::Context* XPUContext::x_context() const { return impl_->GetXContext(); } @@ -158,12 +128,16 @@ xpu::BKCLContext_t XPUContext::bkcl_context() const { void XPUContext::Wait() const { impl_->Wait(); } -void XPUContext::set_x_context(xpu::Context* context) { +void XPUContext::SetXContext(xpu::Context* context) { impl_->SetXContext(context); } -void XPUContext::set_bkcl_context(xpu::BKCLContext_t context) { +void XPUContext::SetL3Cache(int l3_size) { impl_->SetL3Cache(l3_size); } + +void XPUContext::SetBkclContext(xpu::BKCLContext_t context) { impl_->SetBkclContext(context); } +void XPUContext::Init() { impl_->Init(); } + } // namespace pten diff --git a/paddle/pten/backends/xpu/xpu_context.h b/paddle/pten/backends/xpu/xpu_context.h index 4ae5786211dd2..440d06a60fc5c 100644 --- a/paddle/pten/backends/xpu/xpu_context.h +++ b/paddle/pten/backends/xpu/xpu_context.h @@ -26,26 +26,15 @@ namespace xpu = baidu::xpu::api; namespace pten { -struct XPUContextResource { - xpu::Context* context{nullptr}; -}; - class XPUContext : public DeviceContext { public: - // NOTE: DeviceContext hold resources. Used in training scenarios. XPUContext(); explicit XPUContext(const XPUPlace&); - // NOTE: Share the same underlying resources, please ensure that resources are - // not released. - XPUContext(const XPUContext&); - - XPUContext(XPUContext&&); - virtual ~XPUContext(); - Place GetPlace() const override; + const Place& GetPlace() const override; backends::xpu::XPUVersion xpu_version() const; @@ -53,21 +42,28 @@ class XPUContext : public DeviceContext { // Return bkcl context. xpu::BKCLContext_t bkcl_context() const; + void SetBkclContext(xpu::BKCLContext_t context); // Wait for all operations completion in the stream. void Wait() const override; public: - // NOTE: External users manage resources. Used in inference scenarios. - explicit XPUContext(const XPUContextResource&); + // NOTE: DeviceContext hold resources. Used in training scenarios. + // The interface used by the training scene, DeviceContext will initialize + // all resources and delete them when destructing. + void Init(); - void set_x_context(xpu::Context*); + public: + // NOTE: External users manage resources. Used in inference scenarios. + // The Set interface is for inference only, DeviceContext will mark the + // resource as external, and will not delete any resource when destructing. 
+ void SetXContext(xpu::Context*); - void set_bkcl_context(xpu::BKCLContext_t context); + void SetL3Cache(int l3_size = 14155776); private: - struct XPUImpl; - std::unique_ptr impl_; + struct Impl; + std::unique_ptr impl_; }; } // namespace pten diff --git a/paddle/pten/core/device_context.cc b/paddle/pten/core/device_context.cc index d6e01c5c6e664..2a11b1bef9dbc 100644 --- a/paddle/pten/core/device_context.cc +++ b/paddle/pten/core/device_context.cc @@ -23,7 +23,7 @@ struct DeviceContext::Impl { Impl() = default; ~Impl() = default; - void SetDeviceAllocator(const Allocator* allocator) { + void SetAllocator(const Allocator* allocator) { PADDLE_ENFORCE_NOT_NULL( allocator, pten::errors::InvalidArgument( @@ -47,7 +47,7 @@ struct DeviceContext::Impl { zero_allocator_ = allocator; } - const Allocator& GetDeviceAllocator() const { + const Allocator& GetAllocator() const { PADDLE_ENFORCE_NOT_NULL( device_allocator_, pten::errors::InvalidArgument("Required device_allocator_ shall not be " @@ -124,7 +124,7 @@ DeviceContext::DeviceContext() { impl_ = std::make_unique(); } DeviceContext::DeviceContext(const DeviceContext& other) { impl_->SetHostAllocator(&other.GetHostAllocator()); - impl_->SetDeviceAllocator(&other.GetDeviceAllocator()); + impl_->SetAllocator(&other.GetAllocator()); impl_->SetZeroAllocator(&other.GetZeroAllocator()); } @@ -134,12 +134,12 @@ DeviceContext::DeviceContext(DeviceContext&& other) { DeviceContext::~DeviceContext() = default; -void DeviceContext::SetDeviceAllocator(const Allocator* allocator) { - impl_->SetDeviceAllocator(allocator); +void DeviceContext::SetAllocator(const Allocator* allocator) { + impl_->SetAllocator(allocator); } -const Allocator& DeviceContext::GetDeviceAllocator() const { - return impl_->GetDeviceAllocator(); +const Allocator& DeviceContext::GetAllocator() const { + return impl_->GetAllocator(); } void DeviceContext::SetHostAllocator(const Allocator* allocator) { diff --git a/paddle/pten/core/device_context.h b/paddle/pten/core/device_context.h index 30be5cd22dd4e..68c16dc3a1964 100644 --- a/paddle/pten/core/device_context.h +++ b/paddle/pten/core/device_context.h @@ -60,7 +60,7 @@ class DeviceContext { * * @param allocator */ - void SetDeviceAllocator(const Allocator*); + void SetAllocator(const Allocator*); /** * @brief Set the host Allocator object. @@ -81,7 +81,7 @@ class DeviceContext { * * @return Allocator */ - const Allocator& GetDeviceAllocator() const; + const Allocator& GetAllocator() const; /** * @brief Get the const device-related Allocator object. @@ -114,7 +114,7 @@ class DeviceContext { // TODO(wilber): Just for the convenience of migrating the code, it will be // modified or removed later. - virtual Place GetPlace() const = 0; + virtual const Place& GetPlace() const = 0; // TODO(wilber): The fluid framework uses wait() in many places, how to delete // this API interface. virtual void Wait() const {} diff --git a/paddle/pten/kernels/funcs/cuda_kernel_config.h b/paddle/pten/kernels/funcs/cuda_kernel_config.h index 6c7c6d11713c3..483e58eedb4d1 100644 --- a/paddle/pten/kernels/funcs/cuda_kernel_config.h +++ b/paddle/pten/kernels/funcs/cuda_kernel_config.h @@ -16,6 +16,7 @@ limitations under the License. */ #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/pten/backends/gpu/gpu_context.h" #ifdef __HIPCC__ #define ELEMENTWISE_BLOCK_SIZE 256 @@ -31,7 +32,7 @@ namespace funcs { * 2x~4x) than number of SMs. 
Hence, SM count is took into account within * this function to determine the right number of threads per block. */ -inline int GetThreadsConfig(const paddle::platform::CUDADeviceContext &ctx, +inline int GetThreadsConfig(const pten::GPUContext &ctx, int64_t numel, int vec_size) { int threads = ELEMENTWISE_BLOCK_SIZE; diff --git a/paddle/pten/kernels/funcs/elementwise_base.h b/paddle/pten/kernels/funcs/elementwise_base.h index 3f1651eeb276f..d102fd637181f 100644 --- a/paddle/pten/kernels/funcs/elementwise_base.h +++ b/paddle/pten/kernels/funcs/elementwise_base.h @@ -23,8 +23,8 @@ limitations under the License. */ #if defined(__NVCC__) || defined(__HIPCC__) #include "paddle/fluid/platform/aligned_vector.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/function_traits.h" +#include "paddle/pten/backends/gpu/gpu_launch_config.h" #include "paddle/pten/kernels/primitive/kernel_primitives.h" namespace kps = pten::kps; @@ -646,7 +646,8 @@ void ElementwiseCudaKernel(const KPDevice &ctx, VecSize><<>>( ins_data, outs_data, numel, main_offset, func); #else - auto gpu_config = GetGpuLaunchConfig1D(ctx, numel, VecSize); + auto gpu_config = + pten::backends::gpu::GetGpuLaunchConfig1D(ctx, numel, VecSize); int main_offset = (numel / (VecSize * gpu_config.GetBlockSize())) * VecSize * gpu_config.GetBlockSize(); auto stream = ctx.stream(); diff --git a/paddle/pten/kernels/funcs/transpose.cu b/paddle/pten/kernels/funcs/transpose.cu index a7b7184487c96..09baa2c6e023a 100644 --- a/paddle/pten/kernels/funcs/transpose.cu +++ b/paddle/pten/kernels/funcs/transpose.cu @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/memory/memcpy.h" #include "paddle/pten/backends/gpu/gpu_context.h" #include "paddle/pten/core/ddim.h" diff --git a/paddle/pten/kernels/gpu/concat_and_split.h b/paddle/pten/kernels/gpu/concat_and_split.h index 66b21b5f51351..47022666564df 100644 --- a/paddle/pten/kernels/gpu/concat_and_split.h +++ b/paddle/pten/kernels/gpu/concat_and_split.h @@ -237,12 +237,11 @@ __global__ void SplitKernel(const T* input_data, SplitKernelDetail(input_data, in_row, in_col, fixed_out_col, outputs_data); } -static inline void GetBlockDims( - const paddle::platform::CUDADeviceContext& context, - int64_t num_rows, - int64_t num_cols, - dim3* block_dims, - dim3* grid_dims) { +static inline void GetBlockDims(const pten::GPUContext& context, + int64_t num_rows, + int64_t num_cols, + dim3* block_dims, + dim3* grid_dims) { // Set the thread block and grid according to CurrentDeviceId const int kThreadsPerBlock = 1024; int block_cols = kThreadsPerBlock; diff --git a/paddle/pten/kernels/gpu/copy_kernel.cu b/paddle/pten/kernels/gpu/copy_kernel.cu index 6c0e7b1fe65d3..d48a9fb1d774f 100644 --- a/paddle/pten/kernels/gpu/copy_kernel.cu +++ b/paddle/pten/kernels/gpu/copy_kernel.cu @@ -87,9 +87,7 @@ void Copy(const Context& dev_ctx, ctx_gpu_place)); auto stream = blocking ? nullptr - : reinterpret_cast( - dev_ctx) - .stream(); + : reinterpret_cast(dev_ctx).stream(); paddle::memory::Copy( dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream); } else if (paddle::platform::is_cpu_place(src_place) && // NOLINT @@ -113,9 +111,7 @@ void Copy(const Context& dev_ctx, ctx_gpu_place)); auto stream = blocking ? 
nullptr - : reinterpret_cast( - dev_ctx) - .stream(); + : reinterpret_cast(dev_ctx).stream(); paddle::memory::Copy( dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, stream); } else if (paddle::platform::is_gpu_place(src_place) && // NOLINT @@ -140,9 +136,7 @@ void Copy(const Context& dev_ctx, ctx_gpu_place.device)); auto stream = blocking ? nullptr - : reinterpret_cast( - dev_ctx) - .stream(); + : reinterpret_cast(dev_ctx).stream(); paddle::memory::Copy( dst_cuda_pinned_place, dst_ptr, src_gpu_place, src_ptr, size, stream); } else if (paddle::platform::is_cuda_pinned_place(src_place) && // NOLINT @@ -167,9 +161,7 @@ void Copy(const Context& dev_ctx, ctx_gpu_place.device)); auto stream = blocking ? nullptr - : reinterpret_cast( - dev_ctx) - .stream(); + : reinterpret_cast(dev_ctx).stream(); paddle::memory::Copy( dst_gpu_place, dst_ptr, src_cuda_pinned_place, src_ptr, size, stream); } else if (paddle::platform::is_gpu_place(src_place) && // NOLINT @@ -185,9 +177,7 @@ void Copy(const Context& dev_ctx, ctx_place)); auto stream = blocking ? nullptr - : reinterpret_cast( - dev_ctx) - .stream(); + : reinterpret_cast(dev_ctx).stream(); if (paddle::platform::is_same_place(src_place, dst_place)) { paddle::memory::Copy( dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream); diff --git a/paddle/pten/kernels/gpu/elementwise.h b/paddle/pten/kernels/gpu/elementwise.h index f988f5abdb1ca..947f969b077d0 100644 --- a/paddle/pten/kernels/gpu/elementwise.h +++ b/paddle/pten/kernels/gpu/elementwise.h @@ -2020,7 +2020,7 @@ void default_elementwise_add_grad(const GPUContext &ctx, T, kps::AddFunctor, kps::IdentityFunctor>( - dout, dx, kps::IdentityFunctor(), reduce_dims, stream); + ctx, dout, dx, kps::IdentityFunctor(), reduce_dims, stream); } } // dy @@ -2038,7 +2038,7 @@ void default_elementwise_add_grad(const GPUContext &ctx, T, kps::AddFunctor, kps::IdentityFunctor>( - dout, dy, kps::IdentityFunctor(), reduce_dims, stream); + ctx, dout, dy, kps::IdentityFunctor(), reduce_dims, stream); } } } @@ -2137,7 +2137,7 @@ void default_elementwise_sub_grad(const GPUContext &ctx, T, kps::AddFunctor, kps::IdentityFunctor>( - dout, dx, kps::IdentityFunctor(), reduce_dims, stream); + ctx, dout, dx, kps::IdentityFunctor(), reduce_dims, stream); } } // dy @@ -2161,7 +2161,7 @@ void default_elementwise_sub_grad(const GPUContext &ctx, T, kps::AddFunctor, kps::InverseFunctor>( - dout, dy, kps::InverseFunctor(), reduce_dims, stream); + ctx, dout, dy, kps::InverseFunctor(), reduce_dims, stream); } } } diff --git a/paddle/pten/kernels/gpu/reduce.h b/paddle/pten/kernels/gpu/reduce.h index ca850d2c8c5ee..7a76a988dee25 100644 --- a/paddle/pten/kernels/gpu/reduce.h +++ b/paddle/pten/kernels/gpu/reduce.h @@ -1064,7 +1064,8 @@ template class ReduceOp, typename TransformOp> -void TensorReduceFunctorImpl(const pten::DenseTensor& x, +void TensorReduceFunctorImpl(const pten::GPUContext& dev_ctx, + const pten::DenseTensor& x, pten::DenseTensor* y, const TransformOp& transform, const std::vector& origin_reduce_dims, @@ -1088,13 +1089,11 @@ void TensorReduceFunctorImpl(const pten::DenseTensor& x, auto x_data = x.data(); auto y_data = y->data(); - auto* dev_ctx = static_cast( - paddle::platform::DeviceContextPool::Instance().Get(x.place())); if (config.reduce_num == 1) { std::vector inputs = {&x}; std::vector outputs = {y}; funcs::LaunchSameDimsElementwiseCudaKernel( - *dev_ctx, inputs, &outputs, transform); + dev_ctx, inputs, &outputs, transform); return; } @@ -1244,13 +1243,23 @@ void Reduce(const GPUContext& dev_ctx, data_t, 
ReduceOp, TransformOp>( - x, out, TransformOp(reduce_num), reduce_dims, stream); + dev_ctx, + x, + out, + TransformOp(reduce_num), + reduce_dims, + stream); })); } else { using MPType = typename kps::details::MPTypeTrait::Type; pten::kernels:: TensorReduceFunctorImpl>( - x, out, TransformOp(reduce_num), reduce_dims, stream); + dev_ctx, + x, + out, + TransformOp(reduce_num), + reduce_dims, + stream); } } } // namespace pten diff --git a/paddle/pten/kernels/impl/matmul_grad_kernel_impl.h b/paddle/pten/kernels/impl/matmul_grad_kernel_impl.h index 87785a2b4778a..226cfd89b1327 100644 --- a/paddle/pten/kernels/impl/matmul_grad_kernel_impl.h +++ b/paddle/pten/kernels/impl/matmul_grad_kernel_impl.h @@ -60,9 +60,11 @@ struct ReduceSumForMatmulGrad { DenseTensor* output, const std::vector& reduce_dims) { auto stream = dev_ctx.stream(); - kernels:: - TensorReduceFunctorImpl>( - input, output, kps::IdentityFunctor(), reduce_dims, stream); + kernels::TensorReduceFunctorImpl>( + dev_ctx, input, output, kps::IdentityFunctor(), reduce_dims, stream); } }; #endif diff --git a/paddle/pten/tests/api/test_matmul_api.cc b/paddle/pten/tests/api/test_matmul_api.cc index 48d2374b68b76..4d3adf86d166d 100644 --- a/paddle/pten/tests/api/test_matmul_api.cc +++ b/paddle/pten/tests/api/test_matmul_api.cc @@ -18,6 +18,7 @@ limitations under the License. */ #include "paddle/pten/api/include/api.h" #include "paddle/pten/api/lib/utils/allocator.h" +#include "paddle/pten/backends/gpu/gpu_context.h" #include "paddle/pten/core/dense_tensor.h" #include "paddle/pten/core/kernel_registry.h" #include "paddle/pten/kernels/copy_kernel.h" @@ -122,7 +123,7 @@ TEST(API, matmul_cuda) { auto& pool = paddle::platform::DeviceContextPool::Instance(); auto place = paddle::platform::CUDAPlace(); - auto* dev_ctx = pool.GetByPlace(place); + auto* dev_ctx = static_cast(pool.GetByPlace(place)); pten::Copy(*dev_ctx, *ref_x.get(), false, dense_x.get()); pten::Copy(*dev_ctx, *ref_y.get(), false, dense_y.get()); diff --git a/paddle/pten/tests/api/test_sparse_utils_api.cc b/paddle/pten/tests/api/test_sparse_utils_api.cc index 3ab7a60dff19e..1ec025faedc2c 100644 --- a/paddle/pten/tests/api/test_sparse_utils_api.cc +++ b/paddle/pten/tests/api/test_sparse_utils_api.cc @@ -47,6 +47,7 @@ TEST(API, to_sparse_coo) { std::copy(&dense_data[0][0], &dense_data[0][0] + 9, dense_x_data); pten::CPUContext dev_ctx_cpu; + dev_ctx_cpu.Init(); // 1. test dense_to_sparse_coo paddle::experimental::Tensor x(dense_x); diff --git a/paddle/pten/tests/core/test_device_context.cc b/paddle/pten/tests/core/test_device_context.cc index c84c4a41a02bf..4fa8faabcfcee 100644 --- a/paddle/pten/tests/core/test_device_context.cc +++ b/paddle/pten/tests/core/test_device_context.cc @@ -25,43 +25,29 @@ limitations under the License. 
*/ namespace pten { namespace tests { +class InferenceCPUContext : public CPUContext { + public: + void SetEigenDevice(Eigen::DefaultDevice* eigen_device) { + CPUContext::SetEigenDevice(eigen_device); + } +}; + TEST(DeviceContext, cpu_context) { std::cout << "test training scenarios" << std::endl; { pten::CPUContext ctx; + ctx.Init(); EXPECT_TRUE(ctx.eigen_device() != nullptr); } std::cout << "test inference scenarios" << std::endl; Eigen::DefaultDevice* device = new Eigen::DefaultDevice(); { - pten::CPUContextResource ctx_res{device}; - pten::CPUContext ctx(ctx_res); - EXPECT_TRUE(ctx.eigen_device() != nullptr); - } - { - pten::CPUContextResource ctx_res{nullptr}; - pten::CPUContext ctx(ctx_res); + InferenceCPUContext ctx; ctx.SetEigenDevice(device); EXPECT_TRUE(ctx.eigen_device() != nullptr); } delete device; - - std::cout << "test copy constructor" << std::endl; - { - pten::CPUContext ctx1; - pten::CPUContext ctx2(ctx1); - EXPECT_EQ(ctx1.eigen_device(), ctx2.eigen_device()); - } - - std::cout << "test move constructor" << std::endl; - { - pten::CPUContext ctx1 = pten::CPUContext(); - auto* eigen_device1 = ctx1.eigen_device(); - pten::CPUContext ctx2(std::move(ctx1)); - auto* eigen_device2 = ctx2.eigen_device(); - EXPECT_EQ(eigen_device1, eigen_device2); - } } } // namespace tests diff --git a/paddle/pten/tests/kernels/test_cast_dev_api.cc b/paddle/pten/tests/kernels/test_cast_dev_api.cc index 2f9dab0fbef04..d6aac6d7db862 100644 --- a/paddle/pten/tests/kernels/test_cast_dev_api.cc +++ b/paddle/pten/tests/kernels/test_cast_dev_api.cc @@ -50,10 +50,10 @@ TEST(DEV_API, cast) { } pten::CPUContext dev_ctx; - dev_ctx.SetDeviceAllocator( - paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(paddle::platform::CPUPlace()) - .get()); + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + dev_ctx.Init(); pten::DataType out_dtype = pten::DataType::FLOAT64; // 2. test API diff --git a/paddle/pten/tests/kernels/test_concat_dev_api.cc b/paddle/pten/tests/kernels/test_concat_dev_api.cc index ad2cf8c220e49..5e78545f6d6d9 100644 --- a/paddle/pten/tests/kernels/test_concat_dev_api.cc +++ b/paddle/pten/tests/kernels/test_concat_dev_api.cc @@ -59,10 +59,10 @@ TEST(DEV_API, concat) { // 2. test API pten::CPUContext dev_ctx; - dev_ctx.SetDeviceAllocator( - paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(paddle::platform::CPUPlace()) - .get()); + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + dev_ctx.Init(); auto out = pten::Concat(dev_ctx, inputs, 0); // 3. check result diff --git a/paddle/pten/tests/kernels/test_conj_dev_api.cc b/paddle/pten/tests/kernels/test_conj_dev_api.cc index 27332d99be6a1..cac8370f67fbf 100644 --- a/paddle/pten/tests/kernels/test_conj_dev_api.cc +++ b/paddle/pten/tests/kernels/test_conj_dev_api.cc @@ -46,10 +46,10 @@ TEST(DEV_API, conj) { } pten::CPUContext dev_ctx; - dev_ctx.SetDeviceAllocator( - paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(paddle::platform::CPUPlace()) - .get()); + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + dev_ctx.Init(); // 2. 
test API auto out = pten::Conj(dev_ctx, dense_x); diff --git a/paddle/pten/tests/kernels/test_copy_dev_api.cc b/paddle/pten/tests/kernels/test_copy_dev_api.cc index 76891028c4399..cd589142aed9b 100644 --- a/paddle/pten/tests/kernels/test_copy_dev_api.cc +++ b/paddle/pten/tests/kernels/test_copy_dev_api.cc @@ -58,10 +58,10 @@ TEST(DEV_API, copy) { std::cout << typeid(a).name() << std::endl; // 2. test API pten::CPUContext dev_ctx; - dev_ctx.SetDeviceAllocator( - paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(paddle::platform::CPUPlace()) - .get()); + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + dev_ctx.Init(); pten::Copy(dev_ctx, *(dense_src.get()), false, dense_dst.get()); // 3. check result diff --git a/paddle/pten/tests/kernels/test_creation_dev_api.cc b/paddle/pten/tests/kernels/test_creation_dev_api.cc index 14f323399cc1f..0e87fe554a2f4 100644 --- a/paddle/pten/tests/kernels/test_creation_dev_api.cc +++ b/paddle/pten/tests/kernels/test_creation_dev_api.cc @@ -33,10 +33,10 @@ using DDim = pten::framework::DDim; TEST(DEV_API, empty) { // 1. create input pten::CPUContext dev_ctx; - dev_ctx.SetDeviceAllocator( - paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(paddle::platform::CPUPlace()) - .get()); + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + dev_ctx.Init(); // 2. test API auto out = pten::Empty(dev_ctx, {3, 2}, pten::DataType::INT32); @@ -64,10 +64,10 @@ TEST(DEV_API, empty_like) { // 2. test API pten::CPUContext dev_ctx; - dev_ctx.SetDeviceAllocator( - paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(paddle::platform::CPUPlace()) - .get()); + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + dev_ctx.Init(); auto out = pten::EmptyLike(dev_ctx, dense_x); // 3. check result @@ -84,10 +84,10 @@ TEST(DEV_API, full) { // 2. test API pten::CPUContext dev_ctx; - dev_ctx.SetDeviceAllocator( - paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(paddle::platform::CPUPlace()) - .get()); + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + dev_ctx.Init(); auto out = pten::Full(dev_ctx, {3, 2}, val, pten::DataType::FLOAT32); // 3. check result @@ -118,10 +118,10 @@ TEST(DEV_API, full_like) { float val = 1.0; pten::CPUContext dev_ctx; - dev_ctx.SetDeviceAllocator( - paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(paddle::platform::CPUPlace()) - .get()); + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + dev_ctx.Init(); // 2. test API auto out = pten::FullLike(dev_ctx, dense_x, val); diff --git a/paddle/pten/tests/kernels/test_dot_dev_api.cc b/paddle/pten/tests/kernels/test_dot_dev_api.cc index 3343ab1d4283a..051f05c7805ca 100644 --- a/paddle/pten/tests/kernels/test_dot_dev_api.cc +++ b/paddle/pten/tests/kernels/test_dot_dev_api.cc @@ -60,10 +60,10 @@ TEST(DEV_API, dot) { // 2. 
test API pten::CPUContext dev_ctx; - dev_ctx.SetDeviceAllocator( - paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(paddle::platform::CPUPlace()) - .get()); + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + dev_ctx.Init(); auto out = pten::Dot(dev_ctx, dense_x, dense_y); // 3. check result diff --git a/paddle/pten/tests/kernels/test_elementwise_dev_api.cc b/paddle/pten/tests/kernels/test_elementwise_dev_api.cc index 5a44fb6dae870..20dd5ddb4a38c 100644 --- a/paddle/pten/tests/kernels/test_elementwise_dev_api.cc +++ b/paddle/pten/tests/kernels/test_elementwise_dev_api.cc @@ -62,10 +62,10 @@ TEST(DEV_API, add) { // 2. test API pten::CPUContext dev_ctx; - dev_ctx.SetDeviceAllocator( - paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(paddle::platform::CPUPlace()) - .get()); + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + dev_ctx.Init(); auto dense_out = pten::Add(dev_ctx, dense_x, dense_y); // 3. check result @@ -116,10 +116,10 @@ TEST(DEV_API, subtract) { // 2. test API pten::CPUContext dev_ctx; - dev_ctx.SetDeviceAllocator( - paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(paddle::platform::CPUPlace()) - .get()); + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + dev_ctx.Init(); auto dense_out = pten::Subtract(dev_ctx, dense_x, dense_y); // 3. check result @@ -170,10 +170,10 @@ TEST(DEV_API, divide) { // 2. test API pten::CPUContext dev_ctx; - dev_ctx.SetDeviceAllocator( - paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(paddle::platform::CPUPlace()) - .get()); + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + dev_ctx.Init(); auto dense_out = pten::Divide(dev_ctx, dense_x, dense_y); // 3. check result @@ -224,10 +224,10 @@ TEST(DEV_API, multiply) { // 2. test API pten::CPUContext dev_ctx; - dev_ctx.SetDeviceAllocator( - paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(paddle::platform::CPUPlace()) - .get()); + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + dev_ctx.Init(); auto dense_out = pten::Multiply(dev_ctx, dense_x, dense_y); // 3. check result diff --git a/paddle/pten/tests/kernels/test_flatten_dev_api.cc b/paddle/pten/tests/kernels/test_flatten_dev_api.cc index 749004f3d71e9..f9dc1df818e45 100644 --- a/paddle/pten/tests/kernels/test_flatten_dev_api.cc +++ b/paddle/pten/tests/kernels/test_flatten_dev_api.cc @@ -56,10 +56,10 @@ TEST(DEV_API, flatten) { } int start_axis = 1, stop_axis = 2; pten::CPUContext dev_ctx; - dev_ctx.SetDeviceAllocator( - paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(paddle::platform::CPUPlace()) - .get()); + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + dev_ctx.Init(); // 2. 
test API auto out = pten::Flatten(dev_ctx, dense_x, start_axis, stop_axis); diff --git a/paddle/pten/tests/kernels/test_matmul_dev_api.cc b/paddle/pten/tests/kernels/test_matmul_dev_api.cc index 0dfbb54a496ae..b1c0b40eae775 100644 --- a/paddle/pten/tests/kernels/test_matmul_dev_api.cc +++ b/paddle/pten/tests/kernels/test_matmul_dev_api.cc @@ -55,10 +55,10 @@ TEST(DEV_API, dot) { // 2. test API pten::CPUContext dev_ctx; - dev_ctx.SetDeviceAllocator( - paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(paddle::platform::CPUPlace()) - .get()); + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + dev_ctx.Init(); auto out = Matmul(dev_ctx, dense_x, dense_y, false, false); // 3. check result diff --git a/paddle/pten/tests/kernels/test_mean_dev_api.cc b/paddle/pten/tests/kernels/test_mean_dev_api.cc index 5360c711d0a7c..4b84a13115047 100644 --- a/paddle/pten/tests/kernels/test_mean_dev_api.cc +++ b/paddle/pten/tests/kernels/test_mean_dev_api.cc @@ -49,10 +49,10 @@ TEST(DEV_API, mean) { // 2. test API pten::CPUContext dev_ctx; - dev_ctx.SetDeviceAllocator( - paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(paddle::platform::CPUPlace()) - .get()); + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + dev_ctx.Init(); auto out = pten::Mean(dev_ctx, dense_x, dims, false); // 3. check result diff --git a/paddle/pten/tests/kernels/test_reshape_dev_api.cc b/paddle/pten/tests/kernels/test_reshape_dev_api.cc index 95eda3ef2d49f..58004e718ea2a 100644 --- a/paddle/pten/tests/kernels/test_reshape_dev_api.cc +++ b/paddle/pten/tests/kernels/test_reshape_dev_api.cc @@ -48,10 +48,10 @@ TEST(DEV_API, reshape) { // 2. test API pten::CPUContext dev_ctx; - dev_ctx.SetDeviceAllocator( - paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(paddle::platform::CPUPlace()) - .get()); + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + dev_ctx.Init(); auto out = pten::Reshape(dev_ctx, dense_x, shape); // 3. check result std::vector expect_shape = {12, 3}; diff --git a/paddle/pten/tests/kernels/test_scale_dev_api.cc b/paddle/pten/tests/kernels/test_scale_dev_api.cc index f27e1037d0471..ccad71711d610 100644 --- a/paddle/pten/tests/kernels/test_scale_dev_api.cc +++ b/paddle/pten/tests/kernels/test_scale_dev_api.cc @@ -49,10 +49,11 @@ TEST(DEV_API, scale) { // 2. test API pten::CPUContext dev_ctx; - dev_ctx.SetDeviceAllocator( - paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(paddle::platform::CPUPlace()) - .get()); + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + dev_ctx.Init(); + auto out = pten::Scale(dev_ctx, dense_x, scale, bias, bias_after_scale); @@ -92,10 +93,11 @@ TEST(DEV_API, scale_host) { // 2. 
test API pten::CPUContext dev_ctx; - dev_ctx.SetDeviceAllocator( - paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(paddle::platform::CPUPlace()) - .get()); + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + dev_ctx.Init(); + auto out = pten::Scale(dev_ctx, dense_x, scale, bias, bias_after_scale); diff --git a/paddle/pten/tests/kernels/test_sparse_utils_dev_api.cc b/paddle/pten/tests/kernels/test_sparse_utils_dev_api.cc index e4841097520ba..967609e9a8c3c 100644 --- a/paddle/pten/tests/kernels/test_sparse_utils_dev_api.cc +++ b/paddle/pten/tests/kernels/test_sparse_utils_dev_api.cc @@ -15,6 +15,8 @@ limitations under the License. */ #include #include +#include "paddle/pten/backends/gpu/gpu_context.h" +#include "paddle/pten/common/place.h" #include "paddle/pten/kernels/copy_kernel.h" #include "paddle/pten/kernels/sparse/sparse_utils_kernel.h" @@ -22,6 +24,8 @@ limitations under the License. */ #include "paddle/pten/core/dense_tensor.h" #include "paddle/pten/core/kernel_registry.h" +#include "paddle/fluid/memory/allocation/allocator_facade.h" + namespace pten { namespace tests { @@ -38,9 +42,8 @@ inline void CheckResult( ASSERT_EQ(coo.nnz(), non_zero_num); #if defined(PADDLE_WITH_CUDA) - if (coo.place() == paddle::platform::CUDAPlace()) { - const auto* dev_ctx_cuda = - static_cast(dev_ctx); + if (coo.place() == pten::GPUPlace()) { + const auto* dev_ctx_cuda = static_cast(dev_ctx); DenseTensor indices( alloc.get(), DenseTensorMeta( @@ -86,6 +89,8 @@ void TestDenseToSparseCoo(const DenseTensor& dense_x, paddle::platform::CPUPlace()); pten::CPUContext dev_ctx_cpu; + dev_ctx_cpu.Init(); + // 1. test cpu auto cpu_sparse_out = sparse::DenseToSparseCoo(dev_ctx_cpu, dense_x, sparse_dim); @@ -98,9 +103,21 @@ void TestDenseToSparseCoo(const DenseTensor& dense_x, // 2. test cuda #if defined(PADDLE_WITH_CUDA) - paddle::platform::DeviceContextPool& pool = - paddle::platform::DeviceContextPool::Instance(); - auto* dev_ctx_cuda = pool.GetByPlace(paddle::platform::CUDAPlace()); + // paddle::platform::DeviceContextPool& pool = + // paddle::platform::DeviceContextPool::Instance(); + // auto* dev_ctx_cuda = pool.GetByPlace(paddle::platform::CUDAPlace()); + pten::GPUContext dev_ctx_gpu; + dev_ctx_gpu.PartialInitWithoutAllocator(); + dev_ctx_gpu.SetAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(dev_ctx_gpu.GetPlace(), dev_ctx_gpu.stream()) + .get()); + dev_ctx_gpu.SetHostAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(pten::CPUPlace()) + .get()); + dev_ctx_gpu.PartialInitWithAllocator(); + const auto cuda_alloc = std::make_shared( paddle::platform::CUDAPlace()); @@ -108,10 +125,10 @@ void TestDenseToSparseCoo(const DenseTensor& dense_x, cuda_alloc.get(), DenseTensorMeta(dense_x.dtype(), dense_x.dims(), dense_x.layout())); - pten::Copy(*dev_ctx_cuda, dense_x, true, &d_dense_x); + pten::Copy(dev_ctx_gpu, dense_x, true, &d_dense_x); auto sparse_out = - sparse::DenseToSparseCoo(*dev_ctx_cuda, d_dense_x, sparse_dim); - CheckResult(dev_ctx_cuda, + sparse::DenseToSparseCoo(dev_ctx_gpu, d_dense_x, sparse_dim); + CheckResult(&dev_ctx_gpu, sparse_out, non_zero_data, indices_data, @@ -295,20 +312,32 @@ void TestSparseCsrToCoo(const DDim& dense_dims, alloc); // 2. 
test cuda #if defined(PADDLE_WITH_CUDA) + pten::GPUContext dev_ctx_gpu; + dev_ctx_gpu.PartialInitWithoutAllocator(); + dev_ctx_gpu.SetAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(dev_ctx_gpu.GetPlace(), dev_ctx_gpu.stream()) + .get()); + dev_ctx_gpu.SetHostAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(pten::CPUPlace()) + .get()); + dev_ctx_gpu.PartialInitWithAllocator(); + const auto cuda_alloc = std::make_shared( paddle::platform::CUDAPlace()); - auto& pool = paddle::platform::DeviceContextPool::Instance(); - auto* dev_ctx_cuda = pool.GetByPlace(paddle::platform::CUDAPlace()); + // auto& pool = paddle::platform::DeviceContextPool::Instance(); + // auto* dev_ctx_cuda = pool.GetByPlace(paddle::platform::CUDAPlace()); pten::DenseTensor d_crows(cuda_alloc.get(), crows_meta); pten::DenseTensor d_cols(cuda_alloc.get(), cols_meta); pten::DenseTensor d_values(cuda_alloc.get(), values_meta); - pten::Copy(*dev_ctx_cuda, crows, true, &d_crows); - pten::Copy(*dev_ctx_cuda, cols, true, &d_cols); - pten::Copy(*dev_ctx_cuda, values, true, &d_values); + pten::Copy(dev_ctx_gpu, crows, true, &d_crows); + pten::Copy(dev_ctx_gpu, cols, true, &d_cols); + pten::Copy(dev_ctx_gpu, values, true, &d_values); pten::SparseCsrTensor d_csr(d_crows, d_cols, d_values, dense_dims); - auto cuda_sparse_out = sparse::SparseCsrToCoo(*dev_ctx_cuda, d_csr); - CheckResult(dev_ctx_cuda, + auto cuda_sparse_out = sparse::SparseCsrToCoo(dev_ctx_gpu, d_csr); + CheckResult(&dev_ctx_gpu, cuda_sparse_out, non_zero_data, indices_data, diff --git a/paddle/pten/tests/kernels/test_sum_dev_api.cc b/paddle/pten/tests/kernels/test_sum_dev_api.cc index 023d62c7ed9ff..b05d40692afb4 100644 --- a/paddle/pten/tests/kernels/test_sum_dev_api.cc +++ b/paddle/pten/tests/kernels/test_sum_dev_api.cc @@ -47,10 +47,11 @@ TEST(DEV_API, sum) { std::vector axis = {0, 1}; pten::CPUContext dev_ctx; - dev_ctx.SetDeviceAllocator( - paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(paddle::platform::CPUPlace()) - .get()); + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + dev_ctx.Init(); + // 2. test API auto out = pten::Sum(dev_ctx, dense_x, axis, pten::DataType::FLOAT32, false);
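Taken together, the dev-API test changes above all follow the same construction recipe for the reworked contexts: set the allocators explicitly, then run the (partial) Init calls. Condensed into two helpers as a sketch mirroring the test code, assuming a CUDA build; the helper names InitTestCPUContext and InitTestGPUContext are illustrative, not part of the patch:

#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/pten/backends/cpu/cpu_context.h"
#include "paddle/pten/backends/gpu/gpu_context.h"
#include "paddle/pten/common/place.h"

// CPU: set the allocator, then Init() so the context sets up its remaining
// resources itself (training-style initialization).
void InitTestCPUContext(pten::CPUContext* ctx) {
  ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                        .GetAllocator(paddle::platform::CPUPlace())
                        .get());
  ctx->Init();
}

// GPU: the stream has to exist before the stream-aware device allocator can be
// fetched, hence the two-phase PartialInit* calls around the Set*Allocator calls.
void InitTestGPUContext(pten::GPUContext* ctx) {
  ctx->PartialInitWithoutAllocator();
  ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                        .GetAllocator(ctx->GetPlace(), ctx->stream())
                        .get());
  ctx->SetHostAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                            .GetAllocator(pten::CPUPlace())
                            .get());
  ctx->PartialInitWithAllocator();
}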