From 0efcae8676869d923eb3beca5259549e8b0776a0 Mon Sep 17 00:00:00 2001 From: Zhang Ting Date: Wed, 12 Jan 2022 21:09:38 +0800 Subject: [PATCH 01/24] [part 3]change type of function args (#38887) * code clean * [part 3]change type of function args --- .../fluid/operators/controlflow/bitwise_op.h | 30 ++++++------- .../operators/controlflow/compare_all_op.h | 2 +- .../fluid/operators/controlflow/compare_op.h | 12 +++--- .../fluid/operators/controlflow/logical_op.cu | 28 ++----------- .../fluid/operators/controlflow/logical_op.h | 42 ++++++++----------- 5 files changed, 44 insertions(+), 70 deletions(-) diff --git a/paddle/fluid/operators/controlflow/bitwise_op.h b/paddle/fluid/operators/controlflow/bitwise_op.h index 92abe4cd3b1c3..9e652f9200747 100644 --- a/paddle/fluid/operators/controlflow/bitwise_op.h +++ b/paddle/fluid/operators/controlflow/bitwise_op.h @@ -22,19 +22,19 @@ limitations under the License. */ namespace paddle { namespace operators { -#define BITWISE_BINARY_FUNCTOR(func, expr, bool_expr) \ - template \ - struct Bitwise##func##Functor { \ - using ELEM_TYPE = T; \ - HOSTDEVICE T operator()(const T& a, const T& b) const { return a expr b; } \ - }; \ - \ - template <> \ - struct Bitwise##func##Functor { \ - using ELEM_TYPE = bool; \ - HOSTDEVICE bool operator()(const bool& a, const bool& b) const { \ - return a bool_expr b; \ - } \ +#define BITWISE_BINARY_FUNCTOR(func, expr, bool_expr) \ + template \ + struct Bitwise##func##Functor { \ + using ELEM_TYPE = T; \ + HOSTDEVICE T operator()(const T a, const T b) const { return a expr b; } \ + }; \ + \ + template <> \ + struct Bitwise##func##Functor { \ + using ELEM_TYPE = bool; \ + HOSTDEVICE bool operator()(const bool a, const bool b) const { \ + return a bool_expr b; \ + } \ }; BITWISE_BINARY_FUNCTOR(And, &, &&) @@ -45,13 +45,13 @@ BITWISE_BINARY_FUNCTOR(Xor, ^, !=) template struct BitwiseNotFunctor { using ELEM_TYPE = T; - HOSTDEVICE T operator()(const T& a) const { return ~a; } + HOSTDEVICE T operator()(const T a) const { return ~a; } }; template <> struct BitwiseNotFunctor { using ELEM_TYPE = bool; - HOSTDEVICE bool operator()(const bool& a) const { return !a; } + HOSTDEVICE bool operator()(const bool a) const { return !a; } }; template diff --git a/paddle/fluid/operators/controlflow/compare_all_op.h b/paddle/fluid/operators/controlflow/compare_all_op.h index bcad240601cf6..78a7b76e3fd9d 100644 --- a/paddle/fluid/operators/controlflow/compare_all_op.h +++ b/paddle/fluid/operators/controlflow/compare_all_op.h @@ -28,7 +28,7 @@ namespace operators { template struct EqualReduceFunctor { using ELEM_TYPE = T; - HOSTDEVICE bool operator()(const T& a, const T& b) const { + HOSTDEVICE bool operator()(const T a, const T b) const { if (std::is_floating_point::value) { // This branch will be optimized while compiling if T is integer. It is // safe to cast a and b to double. 
diff --git a/paddle/fluid/operators/controlflow/compare_op.h b/paddle/fluid/operators/controlflow/compare_op.h index 36185322a96b8..d2ef4c9befba9 100644 --- a/paddle/fluid/operators/controlflow/compare_op.h +++ b/paddle/fluid/operators/controlflow/compare_op.h @@ -25,31 +25,31 @@ namespace operators { template struct LessThanFunctor { using ELEM_TYPE = T; - HOSTDEVICE bool operator()(const T& a, const T& b) const { return a < b; } + HOSTDEVICE bool operator()(const T a, const T b) const { return a < b; } }; template struct LessEqualFunctor { using ELEM_TYPE = T; - HOSTDEVICE bool operator()(const T& a, const T& b) const { return a <= b; } + HOSTDEVICE bool operator()(const T a, const T b) const { return a <= b; } }; template struct GreaterThanFunctor { using ELEM_TYPE = T; - HOSTDEVICE bool operator()(const T& a, const T& b) const { return a > b; } + HOSTDEVICE bool operator()(const T a, const T b) const { return a > b; } }; template struct GreaterEqualFunctor { using ELEM_TYPE = T; - HOSTDEVICE bool operator()(const T& a, const T& b) const { return a >= b; } + HOSTDEVICE bool operator()(const T a, const T b) const { return a >= b; } }; template struct EqualFunctor { using ELEM_TYPE = T; - HOSTDEVICE bool operator()(const T& a, const T& b) const { + HOSTDEVICE bool operator()(const T a, const T b) const { if (std::is_floating_point::value) { // This branch will be optimized while compiling if T is integer. It is // safe to cast a and b to double. @@ -63,7 +63,7 @@ struct EqualFunctor { template struct NotEqualFunctor { using ELEM_TYPE = T; - HOSTDEVICE bool operator()(const T& a, const T& b) const { + HOSTDEVICE bool operator()(const T a, const T b) const { return !EqualFunctor()(a, b); } }; diff --git a/paddle/fluid/operators/controlflow/logical_op.cu b/paddle/fluid/operators/controlflow/logical_op.cu index 301b4c4149fad..4a3fc6c895174 100644 --- a/paddle/fluid/operators/controlflow/logical_op.cu +++ b/paddle/fluid/operators/controlflow/logical_op.cu @@ -18,26 +18,6 @@ namespace plat = paddle::platform; namespace paddle { namespace operators { -#define LOGICAL_BINARY_FUNCTOR(func_name, op) \ - template \ - struct func_name { \ - using ELEMENT_TYPE = T; \ - HOSTDEVICE bool operator()(const T* args) const { \ - return static_cast(args[0]) op static_cast(args[1]); \ - } \ - }; - -LOGICAL_BINARY_FUNCTOR(CudaOrFunctor, ||) -LOGICAL_BINARY_FUNCTOR(CudaAndFunctor, &&) -LOGICAL_BINARY_FUNCTOR(CudaXorFunctor, ^) -#undef LOGICAL_BINARY_FUNCTOR - -template -struct CudaNotFunctor { - using ELEMENT_TYPE = T; - HOSTDEVICE bool operator()(const T* args) const { return !args[0]; } -}; - template class BinaryLogicalOpKernel : public framework::OpKernel { @@ -76,8 +56,8 @@ class BinaryLogicalOpKernel ops::BinaryLogicalOpKernel>, \ ops::BinaryLogicalOpKernel>); -REGISTER_LOGICAL_CUDA_KERNEL(logical_or, CudaOrFunctor) -REGISTER_LOGICAL_CUDA_KERNEL(logical_and, CudaAndFunctor) -REGISTER_LOGICAL_CUDA_KERNEL(logical_xor, CudaXorFunctor) -REGISTER_LOGICAL_CUDA_KERNEL(logical_not, CudaNotFunctor) +REGISTER_LOGICAL_CUDA_KERNEL(logical_or, LogicalOrFunctor) +REGISTER_LOGICAL_CUDA_KERNEL(logical_and, LogicalAndFunctor) +REGISTER_LOGICAL_CUDA_KERNEL(logical_xor, LogicalXorFunctor) +REGISTER_LOGICAL_CUDA_KERNEL(logical_not, LogicalNotFunctor) #undef REGISTER_LOGICAL_CUDA_KERNEL diff --git a/paddle/fluid/operators/controlflow/logical_op.h b/paddle/fluid/operators/controlflow/logical_op.h index 92fe0a10cb907..ee63da60fcd0f 100644 --- a/paddle/fluid/operators/controlflow/logical_op.h +++ 
b/paddle/fluid/operators/controlflow/logical_op.h @@ -19,38 +19,32 @@ limitations under the License. */ namespace paddle { namespace operators { -template -struct LogicalAndFunctor { - using ELEM_TYPE = T; - HOSTDEVICE bool operator()(const T& a, const T& b) const { return a && b; } -}; +#define LOGICAL_BINARY_FUNCTOR(func_name, op) \ + template \ + struct func_name { \ + using ELEMENT_TYPE = T; \ + HOSTDEVICE bool operator()(const T a, const T b) const { \ + return static_cast(a) op static_cast(b); \ + } \ + }; -template -struct LogicalOrFunctor { - using ELEM_TYPE = T; - HOSTDEVICE bool operator()(const T& a, const T& b) const { return a || b; } -}; +LOGICAL_BINARY_FUNCTOR(LogicalOrFunctor, ||) +LOGICAL_BINARY_FUNCTOR(LogicalAndFunctor, &&) +LOGICAL_BINARY_FUNCTOR(LogicalXorFunctor, ^) +#undef LOGICAL_BINARY_FUNCTOR template struct LogicalNotFunctor { - using ELEM_TYPE = T; - HOSTDEVICE bool operator()(const T& a) const { return !a; } -}; - -template -struct LogicalXorFunctor { - using ELEM_TYPE = T; - HOSTDEVICE bool operator()(const T& a, const T& b) const { - return (a || b) && !(a && b); - } + using ELEMENT_TYPE = T; + HOSTDEVICE bool operator()(const T a) const { return !a; } }; template class BinaryLogicalOpKernel - : public framework::OpKernel { + : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - using T = typename Functor::ELEM_TYPE; + using T = typename Functor::ELEMENT_TYPE; auto* x = context.Input("X"); auto* y = context.Input("Y"); auto* out = context.Output("Out"); @@ -62,10 +56,10 @@ class BinaryLogicalOpKernel template class UnaryLogicalOpKernel - : public framework::OpKernel { + : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - using T = typename Functor::ELEM_TYPE; + using T = typename Functor::ELEMENT_TYPE; auto* x = context.Input("X"); auto* out = context.Output("Out"); Functor unary_func; From 277cf900fb49a28e7d7818addbb863f2b62d3ef5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=9F=B3=E6=99=93=E4=BC=9F?= <39303645+Shixiaowei02@users.noreply.github.com> Date: Thu, 13 Jan 2022 10:23:12 +0800 Subject: [PATCH 02/24] splits allocation for pten, test=develop (#38853) --- paddle/fluid/framework/operator.h | 4 +- paddle/fluid/framework/tensor.cc | 8 -- paddle/fluid/framework/tensor.h | 8 -- paddle/fluid/framework/tensor_util.cc | 3 +- paddle/fluid/framework/tensor_util.h | 9 +- .../inference/api/details/zero_copy_tensor.cc | 7 +- paddle/fluid/inference/lite/tensor_utils.cc | 5 +- .../memory/allocation/aligned_allocator.cc | 14 ++- .../memory/allocation/aligned_allocator.h | 4 +- paddle/fluid/memory/allocation/allocator.cc | 9 +- paddle/fluid/memory/allocation/allocator.h | 105 ++++++----------- .../memory/allocation/allocator_facade.cc | 34 +++--- .../memory/allocation/allocator_facade.h | 1 + .../auto_growth_best_fit_allocator.cc | 11 +- .../auto_growth_best_fit_allocator.h | 8 +- .../auto_growth_best_fit_allocator_test.cc | 8 +- .../fluid/memory/allocation/base_ptr_test.cu | 8 +- .../memory/allocation/best_fit_allocator.cc | 6 +- .../memory/allocation/best_fit_allocator.h | 8 +- .../memory/allocation/buffered_allocator.cc | 7 +- .../memory/allocation/buffered_allocator.h | 4 +- .../allocation/buffered_allocator_test.cc | 6 +- .../fluid/memory/allocation/cpu_allocator.cc | 4 +- .../fluid/memory/allocation/cpu_allocator.h | 4 +- .../fluid/memory/allocation/cuda_allocator.cc | 4 +- .../fluid/memory/allocation/cuda_allocator.h | 4 +- 
.../cuda_device_context_allocator.h | 14 +-- .../allocation/cuda_virtual_mem_allocator.cc | 4 +- .../allocation/cuda_virtual_mem_allocator.h | 4 +- .../memory/allocation/locked_allocator.cc | 4 +- .../memory/allocation/locked_allocator.h | 4 +- .../allocation/naive_best_fit_allocator.cc | 4 +- .../allocation/naive_best_fit_allocator.h | 4 +- .../fluid/memory/allocation/npu_allocator.cc | 4 +- .../fluid/memory/allocation/npu_allocator.h | 4 +- .../memory/allocation/npu_pinned_allocator.cc | 8 +- .../memory/allocation/npu_pinned_allocator.h | 8 +- .../memory/allocation/pinned_allocator.cc | 4 +- .../memory/allocation/pinned_allocator.h | 4 +- .../memory/allocation/retry_allocator.cc | 4 +- .../fluid/memory/allocation/retry_allocator.h | 4 +- .../memory/allocation/retry_allocator_test.cc | 4 +- .../allocation/stream_safe_cuda_allocator.cc | 9 +- .../allocation/stream_safe_cuda_allocator.h | 8 +- .../allocation/test_aligned_allocator.cc | 4 +- .../allocation/thread_local_allocator.h | 4 +- ...l_memory_auto_growth_best_fit_allocator.cc | 8 +- ...al_memory_auto_growth_best_fit_allocator.h | 6 +- paddle/fluid/memory/malloc.h | 2 +- .../fluid/operators/math/concat_and_split.cu | 10 +- .../device/mlu/device_context_allocator.h | 6 +- .../fluid/platform/device/npu/npu_op_runner.h | 3 +- paddle/fluid/pybind/eager_functions.cc | 2 +- paddle/pten/api/lib/utils/CMakeLists.txt | 2 +- paddle/pten/api/lib/utils/allocator.cc | 23 ---- paddle/pten/api/lib/utils/allocator.h | 8 +- paddle/pten/api/lib/utils/storage.cc | 5 +- paddle/pten/api/lib/utils/tensor_utils.cc | 2 +- paddle/pten/core/allocator.h | 3 + paddle/pten/core/candidate/allocator.h | 107 ++++++++++++++++++ paddle/pten/core/dense_tensor.h | 2 + paddle/pten/core/storage.h | 1 + paddle/pten/tests/core/allocator.h | 7 +- paddle/pten/tests/core/test_allocator.cc | 4 + tools/check_file_diff_approvals.sh | 19 +--- 65 files changed, 328 insertions(+), 292 deletions(-) delete mode 100644 paddle/pten/api/lib/utils/allocator.cc create mode 100644 paddle/pten/core/candidate/allocator.h diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 0a46c83a2b3ad..09e4abc77f573 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -410,8 +410,8 @@ class ExecutionContext { auto tmp_allocation_ptr = memory::Alloc(dev_ctx, product(dim) * sizeof(T)); auto& deleter = tmp_allocation_ptr.get_deleter(); auto* allocation_ptr = tmp_allocation_ptr.release(); - auto shared_allocation = std::shared_ptr( - allocation_ptr, deleter); + auto shared_allocation = + std::shared_ptr(allocation_ptr, deleter); PADDLE_ENFORCE_GE( allocation_ptr->size(), framework::product(dim) * sizeof(T), diff --git a/paddle/fluid/framework/tensor.cc b/paddle/fluid/framework/tensor.cc index f11b37825d4f0..6aa10a058081b 100644 --- a/paddle/fluid/framework/tensor.cc +++ b/paddle/fluid/framework/tensor.cc @@ -17,14 +17,6 @@ limitations under the License. */ DECLARE_bool(use_stream_safe_cuda_allocator); -namespace paddle { -namespace memory { -namespace allocation { -class Allocation; -} // namespace allocation -} // namespace memory -} // namespace paddle - namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h index e86009e9aafea..fcdb837bc80ce 100644 --- a/paddle/fluid/framework/tensor.h +++ b/paddle/fluid/framework/tensor.h @@ -32,14 +32,6 @@ limitations under the License. 
*/ #include "paddle/pten/core/dense_tensor.h" -namespace paddle { -namespace memory { -namespace allocation { -class Allocation; -} // namespace allocation -} // namespace memory -} // namespace paddle - namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 7fd125834a0c3..5fd581220097b 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -151,8 +151,7 @@ void TensorCopyImpl(const TENSOR& src, const platform::Place& dst_place, paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(npu_pinned_place) .get()); - paddle::memory::allocation::Allocation* allocation = - npu_pinned_tensor.Holder().get(); + pten::Allocation* allocation = npu_pinned_tensor.Holder().get(); npu_pinned_allocator->RecordEvent( allocation, reinterpret_cast(ctx).stream()); diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h index 46eba6a1e41bb..11858e4166595 100644 --- a/paddle/fluid/framework/tensor_util.h +++ b/paddle/fluid/framework/tensor_util.h @@ -183,8 +183,7 @@ void TensorFromArray(const T* src, const size_t& array_size, paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(npu_pinned_place) .get()); - paddle::memory::allocation::Allocation* allocation = - npu_pinned_tensor.Holder().get(); + pten::Allocation* allocation = npu_pinned_tensor.Holder().get(); npu_pinned_allocator->RecordEvent( allocation, reinterpret_cast(ctx).stream()); @@ -241,8 +240,7 @@ void TensorFromVector(const std::vector& src, paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(npu_pinned_place) .get()); - paddle::memory::allocation::Allocation* allocation = - npu_pinned_tensor.Holder().get(); + pten::Allocation* allocation = npu_pinned_tensor.Holder().get(); npu_pinned_allocator->RecordEvent( allocation, reinterpret_cast(ctx).stream()); @@ -312,8 +310,7 @@ inline void TensorFromVector(const std::vector& src, paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(npu_pinned_place) .get()); - paddle::memory::allocation::Allocation* allocation = - npu_pinned_tensor.Holder().get(); + pten::Allocation* allocation = npu_pinned_tensor.Holder().get(); npu_pinned_allocator->RecordEvent( allocation, reinterpret_cast(ctx).stream()); diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc index 01d4dbccd50ea..2f2f4c0ead760 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc @@ -223,9 +223,10 @@ void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb, auto t_place = tensor->place(); paddle::framework::Tensor out; - auto mem_allocation = std::make_shared( - static_cast(data), ele_num * sizeof(T), - paddle::platform::CPUPlace()); + auto mem_allocation = + std::make_shared( + static_cast(data), ele_num * sizeof(T), + paddle::platform::CPUPlace()); out.ResetHolder(mem_allocation); if (paddle::platform::is_cpu_place(t_place)) { diff --git a/paddle/fluid/inference/lite/tensor_utils.cc b/paddle/fluid/inference/lite/tensor_utils.cc index b1e0eb5ef16ab..0d5cd29a0c579 100644 --- a/paddle/fluid/inference/lite/tensor_utils.cc +++ b/paddle/fluid/inference/lite/tensor_utils.cc @@ -257,9 +257,8 @@ void TensorDataShare(framework::LoDTensor* dst, paddle::lite_api::Tensor* src) { size_t memory_size = GetLiteTensorNumel(*src) * 
framework::SizeOfType(GetNativePrecisionType(src->precision())); - std::shared_ptr holder( - new memory::allocation::Allocation(src_raw_data, memory_size, - GetNativePlace(src->target()))); + std::shared_ptr holder(new pten::Allocation( + src_raw_data, memory_size, GetNativePlace(src->target()))); dst->Resize(paddle::framework::make_ddim(src->shape())); SetLoD(dst->mutable_lod(), src->lod()); dst->ResetHolderWithType(holder, GetNativePrecisionType(src->precision())); diff --git a/paddle/fluid/memory/allocation/aligned_allocator.cc b/paddle/fluid/memory/allocation/aligned_allocator.cc index 10380c0d6028d..258cff32b4fca 100644 --- a/paddle/fluid/memory/allocation/aligned_allocator.cc +++ b/paddle/fluid/memory/allocation/aligned_allocator.cc @@ -23,7 +23,7 @@ namespace allocation { // For memory address alignment class AlignedAllocation : public Allocation { public: - AlignedAllocation(AllocationPtr underlying_allocation, size_t offset) + AlignedAllocation(DecoratedAllocationPtr underlying_allocation, size_t offset) : Allocation( reinterpret_cast(underlying_allocation->ptr()) + offset, underlying_allocation->base_ptr(), @@ -32,7 +32,7 @@ class AlignedAllocation : public Allocation { underlying_allocation_(std::move(underlying_allocation)) {} private: - AllocationPtr underlying_allocation_; + DecoratedAllocationPtr underlying_allocation_; }; AlignedAllocator::AlignedAllocator( @@ -52,13 +52,17 @@ bool AlignedAllocator::IsAllocThreadSafe() const { return underlying_allocator_->IsAllocThreadSafe(); } -Allocation* AlignedAllocator::AllocateImpl(size_t size) { +pten::Allocation* AlignedAllocator::AllocateImpl(size_t size) { auto raw_allocation = underlying_allocator_->Allocate(size + alignment_); size_t offset = AlignedPtrOffset(raw_allocation->ptr(), alignment_); - return new AlignedAllocation(std::move(raw_allocation), offset); + auto* p = new AlignedAllocation( + static_unique_ptr_cast(std::move(raw_allocation)), offset); + return p; } -void AlignedAllocator::FreeImpl(Allocation* allocation) { delete allocation; } +void AlignedAllocator::FreeImpl(pten::Allocation* allocation) { + delete allocation; +} } // namespace allocation } // namespace memory diff --git a/paddle/fluid/memory/allocation/aligned_allocator.h b/paddle/fluid/memory/allocation/aligned_allocator.h index 6fef5cae8d6af..ffd5ad0fae1b0 100644 --- a/paddle/fluid/memory/allocation/aligned_allocator.h +++ b/paddle/fluid/memory/allocation/aligned_allocator.h @@ -30,9 +30,9 @@ class AlignedAllocator : public Allocator { bool IsAllocThreadSafe() const override; protected: - Allocation* AllocateImpl(size_t size) override; + pten::Allocation* AllocateImpl(size_t size) override; - void FreeImpl(Allocation* allocation) override; + void FreeImpl(pten::Allocation* allocation) override; private: std::shared_ptr underlying_allocator_; diff --git a/paddle/fluid/memory/allocation/allocator.cc b/paddle/fluid/memory/allocation/allocator.cc index 4998f3dbb9613..0ef6f5cbab5cc 100644 --- a/paddle/fluid/memory/allocation/allocator.cc +++ b/paddle/fluid/memory/allocation/allocator.cc @@ -18,11 +18,10 @@ namespace paddle { namespace memory { namespace allocation { -bool Allocator::IsAllocThreadSafe() const { return false; } - -void Allocator::FreeImpl(Allocation* allocation) { - Allocator* allocator = allocation->TopDecoratedAllocator(); - allocator->Free(allocation); +void Allocator::FreeImpl(pten::Allocation* allocation) { + static_cast(allocation) + ->TopDecoratedAllocator() + ->Free(allocation); } } // namespace allocation diff --git 
a/paddle/fluid/memory/allocation/allocator.h b/paddle/fluid/memory/allocation/allocator.h index ee802462ddc94..3f04d47516377 100644 --- a/paddle/fluid/memory/allocation/allocator.h +++ b/paddle/fluid/memory/allocation/allocator.h @@ -22,6 +22,7 @@ #include "paddle/fluid/framework/inlined_vector.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" +#include "paddle/pten/core/allocator.h" DECLARE_string(allocator_strategy); @@ -80,30 +81,19 @@ class Allocator; * e.g., something what is done in AlignedAllocator, etc. * In this case, we should declare a derived class of Allocation, which * contains an underlying Allocation allocated by the underlying allocator. - * Therefore, `decorated_allocators_` of the new Allocation object would + * Therefore, `decorated_allocators_` of the new Allocation object + * would * be a new chain, differing from the underlying Allocation object. */ -class Allocation { +class Allocation : public pten::Allocation { public: - inline Allocation(void* ptr, size_t size, platform::Place place) - : ptr_(ptr), base_ptr_(ptr), size_(size), place_(place) {} - inline Allocation(void* ptr, void* base_ptr, size_t size, - platform::Place place) - : ptr_(ptr), base_ptr_(base_ptr), size_(size), place_(place) {} - - Allocation(const Allocation& o) = delete; - Allocation& operator=(const Allocation& o) = delete; - Allocation(Allocation&& o) = delete; - Allocation& operator=(Allocation&& o) = delete; - - // Returns the holding pointer. - // NOTE: For performance consideration, it is better not to make this method - // as a virtual method. If we want to implement a `defragmentation` later, - // we might need to make `ptr_` field as a protected field, and add a virtual - // method like `defragmentation` to change `ptr_`. - inline void* ptr() const { return ptr_; } - - inline void* base_ptr() const { + Allocation(void* ptr, size_t size, platform::Place place) + : pten::Allocation(ptr, size, place), base_ptr_(ptr) {} + Allocation(void* ptr, void* base_ptr, size_t size, + const platform::Place& place) + : pten::Allocation(ptr, size, place), base_ptr_(base_ptr) {} + + void* base_ptr() const { PADDLE_ENFORCE_EQ(FLAGS_allocator_strategy, "auto_growth", paddle::platform::errors::Unimplemented( "base_ptr() is only implemented for auto_growth " @@ -112,21 +102,6 @@ class Allocation { return base_ptr_; } - // Returns the size of this memory buffer, i.e., ptr() + size() - 1 is the - // last valid element. - // - // NOTE: Some allocator might alloc more memory than request. The size - // could larger than its request. For example, - // the AlignedAllocator will always allocate memory as size + kAlignment. - // The raw pointer might not aligned, so an offset might be added to raw - // the pointer. The size of this allocation will be - // `size + kAlignemnt - offset`. - inline size_t size() const { return size_; } - - inline const platform::Place& place() const { return place_; } - - virtual ~Allocation() {} - private: inline void RegisterDecoratedAllocator(Allocator* allocator) { decorated_allocators_.emplace_back(allocator); @@ -139,10 +114,7 @@ class Allocation { } private: - void* ptr_; void* base_ptr_; // the point that directly requested from system - size_t size_; - platform::Place place_; /** * NOTE(zjl): Since decorated_allocators_ is usually a small vector. 
@@ -162,53 +134,42 @@ class Allocation { friend class Allocator; }; +using AllocationPtr = pten::Allocator::AllocationPtr; +using DecoratedAllocationPtr = + std::unique_ptr; + // Base interface class of memory Allocator. -class Allocator { +class Allocator : public pten::Allocator { public: - virtual ~Allocator() {} - - class AllocationDeleter { - public: - inline void operator()(Allocation* allocation) const { - Allocator* allocator = allocation->TopDecoratedAllocator(); - allocator->Free(allocation); - } - }; - - using AllocationPtr = std::unique_ptr; + static void AllocationDeleter(pten::Allocation* allocation) { + Allocator* allocator = + static_cast(allocation)->TopDecoratedAllocator(); + allocator->Free(allocation); + } // Allocate an allocation. // size may be 0, but it would be too complex if we handle size == 0 // in each Allocator. So we handle size == 0 inside AllocatorFacade // in our design. - inline AllocationPtr Allocate(size_t size) { + AllocationPtr Allocate(size_t size) override { auto ptr = AllocateImpl(size); - ptr->RegisterDecoratedAllocator(this); - return AllocationPtr(ptr); + static_cast(ptr)->RegisterDecoratedAllocator(this); + return AllocationPtr(ptr, AllocationDeleter); } - // This function should not be called outside Allocator class - inline void Free(Allocation* allocation) { - allocation->PopDecoratedAllocator(); + void Free(pten::Allocation* allocation) { + static_cast(allocation)->PopDecoratedAllocator(); FreeImpl(allocation); } - inline uint64_t Release(const platform::Place& place) { - return ReleaseImpl(place); - } - - // True if the `Allocate` is thread safe. - virtual bool IsAllocThreadSafe() const; + uint64_t Release(const platform::Place& place) { return ReleaseImpl(place); } protected: - virtual Allocation* AllocateImpl(size_t size) = 0; - virtual void FreeImpl(Allocation* allocation); + virtual pten::Allocation* AllocateImpl(size_t size) = 0; + virtual void FreeImpl(pten::Allocation* allocation); virtual uint64_t ReleaseImpl(const platform::Place& place) { return 0; } }; -using AllocationDeleter = Allocator::AllocationDeleter; -using AllocationPtr = Allocator::AllocationPtr; - inline size_t AlignedSize(size_t size, size_t alignment) { auto remaining = size % alignment; return remaining == 0 ? size : size + alignment - remaining; @@ -220,6 +181,14 @@ inline size_t AlignedPtrOffset(const void* ptr, size_t alignment) { return diff == 0 ? 
0 : alignment - diff; } +template +decltype(auto) static_unique_ptr_cast(std::unique_ptr&& p) { + static_assert(std::is_base_of::value, + "Derived type must derive from Base."); + auto d = static_cast(p.release()); + return std::unique_ptr(d, p.get_deleter()); +} + } // namespace allocation } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 9bc2f5461f383..474b4fe3d4522 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -94,7 +94,7 @@ class CUDAGraphAllocator class PrivateAllocation : public Allocation { public: PrivateAllocation(CUDAGraphAllocator* allocator, - AllocationPtr underlying_allocation) + DecoratedAllocationPtr underlying_allocation) : Allocation( underlying_allocation->ptr(), underlying_allocation->base_ptr(), underlying_allocation->size(), underlying_allocation->place()), @@ -103,7 +103,7 @@ class CUDAGraphAllocator private: std::shared_ptr allocator_; - AllocationPtr underlying_allocation_; + DecoratedAllocationPtr underlying_allocation_; }; explicit CUDAGraphAllocator(const std::shared_ptr& allocator) @@ -116,12 +116,14 @@ class CUDAGraphAllocator } protected: - Allocation* AllocateImpl(size_t size) { + pten::Allocation* AllocateImpl(size_t size) { VLOG(10) << "Allocate " << size << " for CUDA Graph"; - return new PrivateAllocation(this, underlying_allocator_->Allocate(size)); + return new PrivateAllocation(this, + static_unique_ptr_cast( + underlying_allocator_->Allocate(size))); } - void FreeImpl(Allocation* allocation) { + void FreeImpl(pten::Allocation* allocation) { VLOG(10) << "delete for CUDA Graph"; delete allocation; } @@ -322,7 +324,7 @@ class AllocatorFacadePrivate { return static_cast(pool.Get(place))->stream(); } - void RecordStream(std::shared_ptr allocation, + void RecordStream(std::shared_ptr allocation, const gpuStream_t& stream) { if (allocation->size() == 0) { return; @@ -339,7 +341,7 @@ class AllocatorFacadePrivate { } const gpuStream_t& GetStream( - const std::shared_ptr& allocation) const { + const std::shared_ptr& allocation) const { const StreamSafeCUDAAllocation* stream_safe_cuda_allocation = dynamic_cast(allocation.get()); PADDLE_ENFORCE_NOT_NULL(stream_safe_cuda_allocation, @@ -391,10 +393,10 @@ class AllocatorFacadePrivate { bool IsAllocThreadSafe() const override { return true; } protected: - Allocation* AllocateImpl(size_t size) override { + pten::Allocation* AllocateImpl(size_t size) override { return new Allocation(nullptr, 0, place_); } - void FreeImpl(Allocation* allocation) override { delete allocation; } + void FreeImpl(pten::Allocation* allocation) override { delete allocation; } private: platform::Place place_; @@ -820,9 +822,9 @@ const std::shared_ptr& AllocatorFacade::GetAllocator( return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1); } -std::shared_ptr AllocatorFacade::AllocShared( +std::shared_ptr AllocatorFacade::AllocShared( const platform::Place& place, size_t size) { - return std::shared_ptr(Alloc(place, size)); + return std::shared_ptr(Alloc(place, size)); } AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, @@ -866,7 +868,7 @@ uint64_t AllocatorFacade::Release(const platform::Place& place) { ->Release(place); } -std::shared_ptr AllocatorFacade::AllocShared( +std::shared_ptr AllocatorFacade::AllocShared( const platform::Place& place, size_t size, const platform::Stream& stream) { #if 
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_EQ( @@ -884,14 +886,14 @@ std::shared_ptr AllocatorFacade::AllocShared( } #endif gpuStream_t s = reinterpret_cast(stream.id()); - return std::shared_ptr(Alloc(place, size, s)); + return std::shared_ptr(Alloc(place, size, s)); #else PADDLE_THROW(platform::errors::PreconditionNotMet("Not compiled with GPU.")); #endif } bool AllocatorFacade::InSameStream( - const std::shared_ptr& allocation, + const std::shared_ptr& allocation, const platform::Stream& stream) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_EQ( @@ -962,7 +964,7 @@ uint64_t AllocatorFacade::Release(const platform::CUDAPlace& place, return m_->GetAllocator(place, stream)->Release(place); } -void AllocatorFacade::RecordStream(std::shared_ptr allocation, +void AllocatorFacade::RecordStream(std::shared_ptr allocation, const gpuStream_t& stream) { PADDLE_ENFORCE_EQ( FLAGS_use_stream_safe_cuda_allocator, true, @@ -983,7 +985,7 @@ void AllocatorFacade::RecordStream(std::shared_ptr allocation, } const gpuStream_t& AllocatorFacade::GetStream( - const std::shared_ptr& allocation) const { + const std::shared_ptr& allocation) const { PADDLE_ENFORCE_EQ( FLAGS_use_stream_safe_cuda_allocator, true, platform::errors::Unimplemented( diff --git a/paddle/fluid/memory/allocation/allocator_facade.h b/paddle/fluid/memory/allocation/allocator_facade.h index d59ecaece5a70..76e2f0b5a94f6 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.h +++ b/paddle/fluid/memory/allocation/allocator_facade.h @@ -42,6 +42,7 @@ using NPUPinnedAllocator = paddle::memory::allocation::NPUPinnedAllocator; class AllocatorFacadePrivate; class AllocatorFacade { public: + using Allocation = pten::Allocation; AllocatorFacade(const AllocatorFacade& o) = delete; const AllocatorFacade& operator=(const AllocatorFacade& o) = delete; ~AllocatorFacade(); diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc index dd2a65d889d8d..ad62af8480f58 100644 --- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc @@ -45,7 +45,8 @@ AutoGrowthBestFitAllocator::AutoGrowthBestFitAllocator( chunk_size_(std::max(AlignedSize(chunk_size, alignment), alignment)), allow_free_idle_chunk_(allow_free_idle_chunk) {} -Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t unaligned_size) { +pten::Allocation *AutoGrowthBestFitAllocator::AllocateImpl( + size_t unaligned_size) { size_t size = AlignedSize(unaligned_size, alignment_); VLOG(10) << "Allocate " << unaligned_size << " bytes, aligned to " << size; @@ -78,11 +79,13 @@ Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t unaligned_size) { size_t realloc_size = std::max(size, chunk_size_); try { - chunks_.emplace_back(underlying_allocator_->Allocate(realloc_size)); + chunks_.emplace_back(static_unique_ptr_cast( + underlying_allocator_->Allocate(realloc_size))); } catch (BadAlloc &ex) { if (FLAGS_free_when_no_cache_hit) throw ex; FreeIdleChunks(); - chunks_.emplace_back(underlying_allocator_->Allocate(realloc_size)); + chunks_.emplace_back(static_unique_ptr_cast( + underlying_allocator_->Allocate(realloc_size))); } auto *chunk = &(*chunks_.rbegin()); @@ -104,7 +107,7 @@ Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t unaligned_size) { return new BlockAllocation(block_it); } -void AutoGrowthBestFitAllocator::FreeImpl(Allocation *allocation) { +void 
AutoGrowthBestFitAllocator::FreeImpl(pten::Allocation *allocation) { VLOG(10) << "Free " << allocation->size() << " bytes, ptr = " << allocation->ptr(); std::lock_guard guard(spinlock_); diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h index 2334a1b6d4d55..94aff93ec50f8 100644 --- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h @@ -36,9 +36,9 @@ class AutoGrowthBestFitAllocator : public Allocator { bool IsAllocThreadSafe() const override { return true; } protected: - Allocation *AllocateImpl(size_t size) override; + pten::Allocation *AllocateImpl(size_t size) override; - void FreeImpl(Allocation *allocation) override; + void FreeImpl(pten::Allocation *allocation) override; // Release the memory block which is not used in pool. uint64_t ReleaseImpl(const platform::Place &place) override { @@ -64,10 +64,10 @@ class AutoGrowthBestFitAllocator : public Allocator { }; struct Chunk { - explicit Chunk(AllocationPtr allocation) + explicit Chunk(DecoratedAllocationPtr allocation) : allocation_(std::move(allocation)) {} - AllocationPtr allocation_; + DecoratedAllocationPtr allocation_; List blocks_; }; diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc index 926af8292d2e8..5942fbe730e57 100644 --- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc @@ -28,12 +28,12 @@ namespace allocation { class RecordedAllocator : public Allocator { protected: - Allocation *AllocateImpl(size_t size) override { + pten::Allocation *AllocateImpl(size_t size) override { allocated_size_ += size; return new Allocation(malloc(size), size, platform::CPUPlace()); } - void FreeImpl(Allocation *allocation) { + void FreeImpl(pten::Allocation *allocation) { allocated_size_ -= allocation->size(); free(allocation->ptr()); delete allocation; @@ -79,7 +79,7 @@ class LimitedResourceAllocator : public Allocator { size_t AllocatedSize() const { return allocated_size_; } protected: - Allocation *AllocateImpl(size_t size) override { + pten::Allocation *AllocateImpl(size_t size) override { if (allocated_size_ + size > capacity_) { throw BadAlloc("", __FILE__, __LINE__); } @@ -88,7 +88,7 @@ class LimitedResourceAllocator : public Allocator { return new Allocation(malloc(size), size, platform::CPUPlace()); } - void FreeImpl(Allocation *allocation) { + void FreeImpl(pten::Allocation *allocation) { allocated_size_ -= allocation->size(); free(allocation->ptr()); delete allocation; diff --git a/paddle/fluid/memory/allocation/base_ptr_test.cu b/paddle/fluid/memory/allocation/base_ptr_test.cu index a34750a5e34ba..5edabfcb9f5e7 100644 --- a/paddle/fluid/memory/allocation/base_ptr_test.cu +++ b/paddle/fluid/memory/allocation/base_ptr_test.cu @@ -37,7 +37,7 @@ class CUDAAllocatoionBasePtrTest : public ::testing::Test { size_t size = dis_(random_engine_); AllocationPtr allocation = Alloc(place_, size); - void* base_ptr = allocation->base_ptr(); + void* base_ptr = static_cast(allocation.get())->base_ptr(); void* system_ptr = platform::GetGpuBasePtr(allocation->ptr(), place_.GetDeviceId()); EXPECT_EQ(base_ptr, system_ptr); @@ -56,7 +56,7 @@ class CUDAAllocatoionBasePtrTest : public ::testing::Test { size_t size = dis_(random_engine_); AllocationPtr allocation = Alloc(place_, 
size); - void* base_ptr = allocation->base_ptr(); + void* base_ptr = static_cast(allocation.get())->base_ptr(); void* system_ptr = platform::GetGpuBasePtr(allocation->ptr(), place_.GetDeviceId()); EXPECT_EQ(base_ptr, system_ptr); @@ -77,7 +77,7 @@ class CUDAAllocatoionBasePtrTest : public ::testing::Test { size_t size = dis_(random_engine_); AllocationPtr allocation = Alloc(place_, size); - void* base_ptr = allocation->base_ptr(); + void* base_ptr = static_cast(allocation.get())->base_ptr(); void* system_ptr = platform::GetGpuBasePtr(allocation->ptr(), place_.GetDeviceId()); EXPECT_EQ(base_ptr, system_ptr); @@ -91,7 +91,7 @@ class CUDAAllocatoionBasePtrTest : public ::testing::Test { void ZeroSizeAllocTest() { AllocationPtr allocation = Alloc(place_, 0); - void* base_ptr = allocation->base_ptr(); + void* base_ptr = static_cast(allocation.get())->base_ptr(); void* system_ptr = platform::GetGpuBasePtr(allocation->ptr(), place_.GetDeviceId()); EXPECT_EQ(base_ptr, system_ptr); diff --git a/paddle/fluid/memory/allocation/best_fit_allocator.cc b/paddle/fluid/memory/allocation/best_fit_allocator.cc index 0955b5212622f..3cba70bd5b502 100644 --- a/paddle/fluid/memory/allocation/best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/best_fit_allocator.cc @@ -33,7 +33,7 @@ static int HighestBitPos(size_t N) { } } -BestFitAllocator::BestFitAllocator(Allocation* allocation) +BestFitAllocator::BestFitAllocator(pten::Allocation* allocation) : allocation_(allocation) { details::Chunk chunk; chunk.size_ = allocation_->size(); @@ -115,7 +115,7 @@ size_t BestFitAllocator::NumFreeChunks() const { } return num; } -void BestFitAllocator::FreeImpl(Allocation* allocation) { +void BestFitAllocator::FreeImpl(pten::Allocation* allocation) { auto* bf_allocation = dynamic_cast(allocation); PADDLE_ENFORCE_NOT_NULL( bf_allocation, @@ -150,7 +150,7 @@ void BestFitAllocator::FreeImpl(Allocation* allocation) { InsertFreeNode(chunk_it); delete allocation; } -Allocation* BestFitAllocator::AllocateImpl(size_t size) { +pten::Allocation* BestFitAllocator::AllocateImpl(size_t size) { auto highest_set_bit = static_cast(HighestBitPos(size)); MapIt map_it; for (; highest_set_bit < free_chunks_.size(); ++highest_set_bit) { diff --git a/paddle/fluid/memory/allocation/best_fit_allocator.h b/paddle/fluid/memory/allocation/best_fit_allocator.h index 42f69e6d704af..297d876178f3d 100644 --- a/paddle/fluid/memory/allocation/best_fit_allocator.h +++ b/paddle/fluid/memory/allocation/best_fit_allocator.h @@ -108,7 +108,7 @@ class BestFitAllocation : public Allocation { // the prev-chunk and the next-chunk when possible. 
class BestFitAllocator : public Allocator { public: - explicit BestFitAllocator(Allocation* allocation); + explicit BestFitAllocator(pten::Allocation* allocation); void* BasePtr() const { return allocation_->ptr(); } @@ -127,11 +127,11 @@ class BestFitAllocator : public Allocator { void InsertFreeNode(const ListIt& it); protected: - void FreeImpl(Allocation* allocation) override; - Allocation* AllocateImpl(size_t size) override; + void FreeImpl(pten::Allocation* allocation) override; + pten::Allocation* AllocateImpl(size_t size) override; private: - Allocation* allocation_; // not owned + pten::Allocation* allocation_; // not owned details::ChunkList chunks_; details::FreeChunkBin free_chunks_; }; diff --git a/paddle/fluid/memory/allocation/buffered_allocator.cc b/paddle/fluid/memory/allocation/buffered_allocator.cc index 325cb010bf466..11739ebba955f 100644 --- a/paddle/fluid/memory/allocation/buffered_allocator.cc +++ b/paddle/fluid/memory/allocation/buffered_allocator.cc @@ -46,12 +46,13 @@ void BufferedAllocator::FreeCache(size_t size) { bool BufferedAllocator::IsAllocThreadSafe() const { return mtx_ != nullptr; } -void BufferedAllocator::FreeImpl(Allocation *allocation) { +void BufferedAllocator::FreeImpl(pten::Allocation *allocation) { platform::LockGuardPtr guard(mtx_); - allocations_.emplace(allocation->size(), AllocationPtr(allocation)); + allocations_.emplace(allocation->size(), + AllocationPtr(allocation, Allocator::AllocationDeleter)); } -Allocation *BufferedAllocator::AllocateImpl(size_t size) { +pten::Allocation *BufferedAllocator::AllocateImpl(size_t size) { { platform::LockGuardPtr guard(mtx_); auto it = allocations_.lower_bound(size); diff --git a/paddle/fluid/memory/allocation/buffered_allocator.h b/paddle/fluid/memory/allocation/buffered_allocator.h index 5e1733bd839de..0ccccef573963 100644 --- a/paddle/fluid/memory/allocation/buffered_allocator.h +++ b/paddle/fluid/memory/allocation/buffered_allocator.h @@ -45,8 +45,8 @@ class BufferedAllocator : public Allocator { void FreeCache(size_t size); protected: - void FreeImpl(Allocation *allocation) override; - Allocation *AllocateImpl(size_t size) override; + void FreeImpl(pten::Allocation *allocation) override; + pten::Allocation *AllocateImpl(size_t size) override; private: std::shared_ptr underlying_allocator_; diff --git a/paddle/fluid/memory/allocation/buffered_allocator_test.cc b/paddle/fluid/memory/allocation/buffered_allocator_test.cc index 0bfa10a1616b6..21c30efccd8ad 100644 --- a/paddle/fluid/memory/allocation/buffered_allocator_test.cc +++ b/paddle/fluid/memory/allocation/buffered_allocator_test.cc @@ -27,7 +27,7 @@ namespace memory { namespace allocation { inline std::unique_ptr GetBufferedAllocator( - Allocation *allocation, bool thread_safe) { + pten::Allocation *allocation, bool thread_safe) { std::unique_ptr allocator(new BestFitAllocator(allocation)); if (thread_safe) { allocator.reset(new LockedAllocator(std::move(allocator))); @@ -68,7 +68,7 @@ class StubAllocator : public Allocator { size_t GetFreeCount() const { return destruct_count_; } protected: - void FreeImpl(Allocation *allocation) override { + void FreeImpl(pten::Allocation *allocation) override { auto *alloc = dynamic_cast(allocation); PADDLE_ENFORCE_NOT_NULL( alloc, platform::errors::InvalidArgument( @@ -77,7 +77,7 @@ class StubAllocator : public Allocator { ++destruct_count_; delete allocation; } - Allocation *AllocateImpl(size_t size) override { + pten::Allocation *AllocateImpl(size_t size) override { ++construct_count_; if (size == 0) { return 
new StubAllocation(nullptr, 0, platform::CPUPlace()); diff --git a/paddle/fluid/memory/allocation/cpu_allocator.cc b/paddle/fluid/memory/allocation/cpu_allocator.cc index 128591f5a8d3e..bf0bd891be26f 100644 --- a/paddle/fluid/memory/allocation/cpu_allocator.cc +++ b/paddle/fluid/memory/allocation/cpu_allocator.cc @@ -24,7 +24,7 @@ namespace allocation { bool CPUAllocator::IsAllocThreadSafe() const { return true; } -void CPUAllocator::FreeImpl(Allocation *allocation) { +void CPUAllocator::FreeImpl(pten::Allocation *allocation) { void *p = allocation->ptr(); #ifdef _WIN32 _aligned_free(p); @@ -34,7 +34,7 @@ void CPUAllocator::FreeImpl(Allocation *allocation) { delete allocation; } -Allocation *CPUAllocator::AllocateImpl(size_t size) { +pten::Allocation *CPUAllocator::AllocateImpl(size_t size) { void *p; #ifdef _WIN32 p = _aligned_malloc(size, kAlignment); diff --git a/paddle/fluid/memory/allocation/cpu_allocator.h b/paddle/fluid/memory/allocation/cpu_allocator.h index 058ff63381658..a64089dd2de42 100644 --- a/paddle/fluid/memory/allocation/cpu_allocator.h +++ b/paddle/fluid/memory/allocation/cpu_allocator.h @@ -37,8 +37,8 @@ class CPUAllocator : public Allocator { bool IsAllocThreadSafe() const override; protected: - void FreeImpl(Allocation* allocation) override; - Allocation* AllocateImpl(size_t size) override; + void FreeImpl(pten::Allocation* allocation) override; + pten::Allocation* AllocateImpl(size_t size) override; }; } // namespace allocation } // namespace memory diff --git a/paddle/fluid/memory/allocation/cuda_allocator.cc b/paddle/fluid/memory/allocation/cuda_allocator.cc index 4242083f2e617..ff9bbf4ab3df8 100644 --- a/paddle/fluid/memory/allocation/cuda_allocator.cc +++ b/paddle/fluid/memory/allocation/cuda_allocator.cc @@ -32,7 +32,7 @@ namespace paddle { namespace memory { namespace allocation { bool CUDAAllocator::IsAllocThreadSafe() const { return true; } -void CUDAAllocator::FreeImpl(Allocation* allocation) { +void CUDAAllocator::FreeImpl(pten::Allocation* allocation) { PADDLE_ENFORCE_EQ( BOOST_GET_CONST(platform::CUDAPlace, allocation->place()), place_, platform::errors::PermissionDenied( @@ -42,7 +42,7 @@ void CUDAAllocator::FreeImpl(Allocation* allocation) { delete allocation; } -Allocation* CUDAAllocator::AllocateImpl(size_t size) { +pten::Allocation* CUDAAllocator::AllocateImpl(size_t size) { std::call_once(once_flag_, [this] { platform::SetDeviceId(place_.device); }); void* ptr; diff --git a/paddle/fluid/memory/allocation/cuda_allocator.h b/paddle/fluid/memory/allocation/cuda_allocator.h index 5969d4d20ddee..57e85a3dc21d1 100644 --- a/paddle/fluid/memory/allocation/cuda_allocator.h +++ b/paddle/fluid/memory/allocation/cuda_allocator.h @@ -28,8 +28,8 @@ class CUDAAllocator : public Allocator { bool IsAllocThreadSafe() const override; protected: - void FreeImpl(Allocation* allocation) override; - Allocation* AllocateImpl(size_t size) override; + void FreeImpl(pten::Allocation* allocation) override; + pten::Allocation* AllocateImpl(size_t size) override; private: platform::CUDAPlace place_; diff --git a/paddle/fluid/memory/allocation/cuda_device_context_allocator.h b/paddle/fluid/memory/allocation/cuda_device_context_allocator.h index 33cf2fe054247..a6696634c12d4 100644 --- a/paddle/fluid/memory/allocation/cuda_device_context_allocator.h +++ b/paddle/fluid/memory/allocation/cuda_device_context_allocator.h @@ -41,7 +41,7 @@ namespace allocation { */ class CUDADeviceContextAllocation : public Allocation { public: - explicit CUDADeviceContextAllocation(AllocationPtr 
allocation) + explicit CUDADeviceContextAllocation(DecoratedAllocationPtr allocation) : Allocation(allocation->ptr(), allocation->base_ptr(), allocation->size(), allocation->place()), underlying_allocation_(std::move(allocation)) {} @@ -56,7 +56,7 @@ class CUDADeviceContextAllocation : public Allocation { << p_allocation; dev_ctx_->AddStreamCallback([p_allocation] { VLOG(4) << "Delete CUDADeviceContextAllocation at " << p_allocation; - AllocationDeleter()(p_allocation); + Allocator::AllocationDeleter(p_allocation); }); } @@ -65,7 +65,7 @@ class CUDADeviceContextAllocation : public Allocation { } private: - AllocationPtr underlying_allocation_; + DecoratedAllocationPtr underlying_allocation_; const platform::CUDADeviceContext *dev_ctx_{nullptr}; }; @@ -102,14 +102,14 @@ class CUDADeviceContextAllocator : public Allocator { } protected: - Allocation *AllocateImpl(size_t size) override { + pten::Allocation *AllocateImpl(size_t size) override { PADDLE_ENFORCE_NOT_NULL( default_stream_, platform::errors::PreconditionNotMet( "Default stream is not set for CUDADeviceContextAllocator")); platform::CUDADeviceGuard guard(place_.device); - auto allocation = - new CUDADeviceContextAllocation(memory::Alloc(place_, size)); + auto allocation = new CUDADeviceContextAllocation( + static_unique_ptr_cast(memory::Alloc(place_, size))); // Wait for the event on stream #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event_, default_stream_)); @@ -121,7 +121,7 @@ class CUDADeviceContextAllocator : public Allocator { return allocation; } - void FreeImpl(Allocation *allocation) override { delete allocation; } + void FreeImpl(pten::Allocation *allocation) override { delete allocation; } private: platform::CUDAPlace place_; diff --git a/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.cc b/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.cc index f4baca8288f03..2ae2cf20ee6d4 100644 --- a/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.cc +++ b/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.cc @@ -101,7 +101,7 @@ CUDAVirtualMemAllocator::CUDAVirtualMemAllocator( bool CUDAVirtualMemAllocator::IsAllocThreadSafe() const { return false; } -void CUDAVirtualMemAllocator::FreeImpl(Allocation* allocation) { +void CUDAVirtualMemAllocator::FreeImpl(pten::Allocation* allocation) { PADDLE_ENFORCE_EQ( BOOST_GET_CONST(platform::CUDAPlace, allocation->place()), place_, platform::errors::PermissionDenied( @@ -140,7 +140,7 @@ void CUDAVirtualMemAllocator::FreeImpl(Allocation* allocation) { delete allocation; } -Allocation* CUDAVirtualMemAllocator::AllocateImpl(size_t size) { +pten::Allocation* CUDAVirtualMemAllocator::AllocateImpl(size_t size) { size = AlignedSize(size, granularity_); CUdeviceptr ptr = virtual_mem_base_ + virtual_mem_alloced_offset_; diff --git a/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.h b/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.h index c51b56566bb02..0e1e59d200d91 100644 --- a/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.h +++ b/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.h @@ -37,8 +37,8 @@ class CUDAVirtualMemAllocator : public Allocator { bool IsAllocThreadSafe() const override; protected: - void FreeImpl(Allocation* allocation) override; - Allocation* AllocateImpl(size_t size) override; + void FreeImpl(pten::Allocation* allocation) override; + pten::Allocation* AllocateImpl(size_t size) override; private: platform::CUDAPlace place_; diff --git a/paddle/fluid/memory/allocation/locked_allocator.cc 
b/paddle/fluid/memory/allocation/locked_allocator.cc index 6e8f870b235ff..a0c8efddbd80d 100644 --- a/paddle/fluid/memory/allocation/locked_allocator.cc +++ b/paddle/fluid/memory/allocation/locked_allocator.cc @@ -37,12 +37,12 @@ LockedAllocator::LockedAllocator( } } -void LockedAllocator::FreeImpl(Allocation *allocation) { +void LockedAllocator::FreeImpl(pten::Allocation *allocation) { platform::LockGuardPtr guard(mtx_); underlying_allocator_->Free(allocation); } -Allocation *LockedAllocator::AllocateImpl(size_t size) { +pten::Allocation *LockedAllocator::AllocateImpl(size_t size) { platform::LockGuardPtr guard(mtx_); return underlying_allocator_->Allocate(size).release(); } diff --git a/paddle/fluid/memory/allocation/locked_allocator.h b/paddle/fluid/memory/allocation/locked_allocator.h index 1b8418bc8494a..d17c8b24e27bd 100644 --- a/paddle/fluid/memory/allocation/locked_allocator.h +++ b/paddle/fluid/memory/allocation/locked_allocator.h @@ -29,8 +29,8 @@ class LockedAllocator : public Allocator { bool IsAllocThreadSafe() const override; protected: - void FreeImpl(Allocation *allocation) override; - Allocation *AllocateImpl(size_t size) override; + void FreeImpl(pten::Allocation *allocation) override; + pten::Allocation *AllocateImpl(size_t size) override; private: std::shared_ptr underlying_allocator_; diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc index 8710bbe6ce98b..ffe7ccf9190be 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc @@ -790,7 +790,7 @@ size_t Usage::operator()(const platform::CUDAPinnedPlace &cuda_pinned) const { namespace allocation { -Allocation *NaiveBestFitAllocator::AllocateImpl(size_t size) { +pten::Allocation *NaiveBestFitAllocator::AllocateImpl(size_t size) { void *ptr = boost::apply_visitor(legacy::AllocVisitor(size), place_); auto *tmp_alloc = new Allocation(ptr, size, place_); platform::MemEvenRecorder::Instance().PushMemRecord( @@ -798,7 +798,7 @@ Allocation *NaiveBestFitAllocator::AllocateImpl(size_t size) { return tmp_alloc; } -void NaiveBestFitAllocator::FreeImpl(Allocation *allocation) { +void NaiveBestFitAllocator::FreeImpl(pten::Allocation *allocation) { boost::apply_visitor( legacy::FreeVisitor(allocation->ptr(), allocation->size()), allocation->place()); diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.h b/paddle/fluid/memory/allocation/naive_best_fit_allocator.h index 474a308a064fd..b7b3647ff98c1 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.h +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.h @@ -34,8 +34,8 @@ class NaiveBestFitAllocator : public Allocator { bool IsAllocThreadSafe() const override { return true; } protected: - Allocation *AllocateImpl(size_t size) override; - void FreeImpl(Allocation *allocation) override; + pten::Allocation *AllocateImpl(size_t size) override; + void FreeImpl(pten::Allocation *allocation) override; uint64_t ReleaseImpl(const platform::Place &place) override; private: diff --git a/paddle/fluid/memory/allocation/npu_allocator.cc b/paddle/fluid/memory/allocation/npu_allocator.cc index 074a900cf5463..d9fa7ec27fdde 100644 --- a/paddle/fluid/memory/allocation/npu_allocator.cc +++ b/paddle/fluid/memory/allocation/npu_allocator.cc @@ -22,7 +22,7 @@ namespace memory { namespace allocation { bool NPUAllocator::IsAllocThreadSafe() const { return true; } -void NPUAllocator::FreeImpl(Allocation* 
allocation) { +void NPUAllocator::FreeImpl(pten::Allocation* allocation) { PADDLE_ENFORCE_EQ( BOOST_GET_CONST(platform::NPUPlace, allocation->place()), place_, platform::errors::PermissionDenied( @@ -32,7 +32,7 @@ void NPUAllocator::FreeImpl(Allocation* allocation) { delete allocation; } -Allocation* NPUAllocator::AllocateImpl(size_t size) { +pten::Allocation* NPUAllocator::AllocateImpl(size_t size) { std::call_once(once_flag_, [this] { platform::SetNPUDeviceId(place_.device); }); diff --git a/paddle/fluid/memory/allocation/npu_allocator.h b/paddle/fluid/memory/allocation/npu_allocator.h index bf668973505ba..88b0c9a24bb3d 100644 --- a/paddle/fluid/memory/allocation/npu_allocator.h +++ b/paddle/fluid/memory/allocation/npu_allocator.h @@ -28,8 +28,8 @@ class NPUAllocator : public Allocator { bool IsAllocThreadSafe() const override; protected: - void FreeImpl(Allocation* allocation) override; - Allocation* AllocateImpl(size_t size) override; + void FreeImpl(pten::Allocation* allocation) override; + pten::Allocation* AllocateImpl(size_t size) override; private: platform::NPUPlace place_; diff --git a/paddle/fluid/memory/allocation/npu_pinned_allocator.cc b/paddle/fluid/memory/allocation/npu_pinned_allocator.cc index 292fe15c5d952..2389973fa9b88 100644 --- a/paddle/fluid/memory/allocation/npu_pinned_allocator.cc +++ b/paddle/fluid/memory/allocation/npu_pinned_allocator.cc @@ -26,7 +26,7 @@ void NPUPinnedAllocator::ProcessEventsAndFree() { platform::NPUEventQuery(event, &status); if (status == ACL_EVENT_STATUS_COMPLETE) { - Allocation *allocation = it->first; + auto *allocation = it->first; void *ptr = allocation->ptr(); free(ptr); npu_events_.erase(it++); @@ -38,7 +38,7 @@ void NPUPinnedAllocator::ProcessEventsAndFree() { } } -Allocation *NPUPinnedAllocator::AllocateImpl(size_t size) { +pten::Allocation *NPUPinnedAllocator::AllocateImpl(size_t size) { std::lock_guard lock(mtx_); ProcessEventsAndFree(); void *ptr; @@ -50,7 +50,7 @@ Allocation *NPUPinnedAllocator::AllocateImpl(size_t size) { return new Allocation(ptr, size, platform::NPUPinnedPlace()); } -void NPUPinnedAllocator::FreeImpl(Allocation *allocation) { +void NPUPinnedAllocator::FreeImpl(pten::Allocation *allocation) { std::lock_guard lock(mtx_); void *ptr = allocation->ptr(); auto iter = npu_events_.find(allocation); @@ -83,7 +83,7 @@ uint64_t NPUPinnedAllocator::ReleaseImpl(const platform::Place &place) { return static_cast(0); } -void NPUPinnedAllocator::RecordEvent(Allocation *allocation, +void NPUPinnedAllocator::RecordEvent(pten::Allocation *allocation, aclrtStream stream) { std::lock_guard lock(mtx_); aclrtEvent event = nullptr; diff --git a/paddle/fluid/memory/allocation/npu_pinned_allocator.h b/paddle/fluid/memory/allocation/npu_pinned_allocator.h index 1d3f8bf1e449d..716b12eea15f8 100644 --- a/paddle/fluid/memory/allocation/npu_pinned_allocator.h +++ b/paddle/fluid/memory/allocation/npu_pinned_allocator.h @@ -32,16 +32,16 @@ class NPUPinnedAllocator : public Allocator { public: bool IsAllocThreadSafe() const override { return true; } void ProcessEventsAndFree(); - void RecordEvent(Allocation *allocation, aclrtStream stream); + void RecordEvent(pten::Allocation *allocation, aclrtStream stream); constexpr static size_t kAlignment = 4096UL; protected: - Allocation *AllocateImpl(size_t size) override; - void FreeImpl(Allocation *allocation) override; + pten::Allocation *AllocateImpl(size_t size) override; + void FreeImpl(pten::Allocation *allocation) override; uint64_t ReleaseImpl(const platform::Place &place) override; private: 
- std::unordered_map npu_events_; + std::unordered_map npu_events_; mutable std::mutex mtx_; }; diff --git a/paddle/fluid/memory/allocation/pinned_allocator.cc b/paddle/fluid/memory/allocation/pinned_allocator.cc index c56a7235c109c..f1175fc4374e7 100644 --- a/paddle/fluid/memory/allocation/pinned_allocator.cc +++ b/paddle/fluid/memory/allocation/pinned_allocator.cc @@ -18,7 +18,7 @@ namespace paddle { namespace memory { namespace allocation { bool CPUPinnedAllocator::IsAllocThreadSafe() const { return true; } -void CPUPinnedAllocator::FreeImpl(Allocation *allocation) { +void CPUPinnedAllocator::FreeImpl(pten::Allocation *allocation) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipHostFree(allocation->ptr())); #else @@ -26,7 +26,7 @@ void CPUPinnedAllocator::FreeImpl(Allocation *allocation) { #endif delete allocation; } -Allocation *CPUPinnedAllocator::AllocateImpl(size_t size) { +pten::Allocation *CPUPinnedAllocator::AllocateImpl(size_t size) { void *ptr; #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipHostMalloc(&ptr, size, hipHostMallocPortable)); diff --git a/paddle/fluid/memory/allocation/pinned_allocator.h b/paddle/fluid/memory/allocation/pinned_allocator.h index 4f535ef33734a..800e3ff3bb2e3 100644 --- a/paddle/fluid/memory/allocation/pinned_allocator.h +++ b/paddle/fluid/memory/allocation/pinned_allocator.h @@ -25,8 +25,8 @@ class CPUPinnedAllocator : public Allocator { bool IsAllocThreadSafe() const override; protected: - void FreeImpl(Allocation *allocation) override; - Allocation *AllocateImpl(size_t size) override; + void FreeImpl(pten::Allocation *allocation) override; + pten::Allocation *AllocateImpl(size_t size) override; }; } // namespace allocation diff --git a/paddle/fluid/memory/allocation/retry_allocator.cc b/paddle/fluid/memory/allocation/retry_allocator.cc index 1607af3808b43..856b6c2e9a2b0 100644 --- a/paddle/fluid/memory/allocation/retry_allocator.cc +++ b/paddle/fluid/memory/allocation/retry_allocator.cc @@ -39,7 +39,7 @@ class WaitedAllocateSizeGuard { size_t requested_size_; }; -void RetryAllocator::FreeImpl(Allocation* allocation) { +void RetryAllocator::FreeImpl(pten::Allocation* allocation) { // Delete underlying allocation first. 
size_t size = allocation->size(); underlying_allocator_->Free(allocation); @@ -51,7 +51,7 @@ void RetryAllocator::FreeImpl(Allocation* allocation) { } } -Allocation* RetryAllocator::AllocateImpl(size_t size) { +pten::Allocation* RetryAllocator::AllocateImpl(size_t size) { auto alloc_func = [&, this]() { return underlying_allocator_->Allocate(size).release(); }; diff --git a/paddle/fluid/memory/allocation/retry_allocator.h b/paddle/fluid/memory/allocation/retry_allocator.h index 031a5e2b97f17..b427a37907a67 100644 --- a/paddle/fluid/memory/allocation/retry_allocator.h +++ b/paddle/fluid/memory/allocation/retry_allocator.h @@ -45,8 +45,8 @@ class RetryAllocator : public Allocator { bool IsAllocThreadSafe() const override { return true; } protected: - void FreeImpl(Allocation* allocation) override; - Allocation* AllocateImpl(size_t size) override; + void FreeImpl(pten::Allocation* allocation) override; + pten::Allocation* AllocateImpl(size_t size) override; uint64_t ReleaseImpl(const platform::Place& place) override { return underlying_allocator_->Release(place); } diff --git a/paddle/fluid/memory/allocation/retry_allocator_test.cc b/paddle/fluid/memory/allocation/retry_allocator_test.cc index 787f3d9dca377..d636c73e07a18 100644 --- a/paddle/fluid/memory/allocation/retry_allocator_test.cc +++ b/paddle/fluid/memory/allocation/retry_allocator_test.cc @@ -98,12 +98,12 @@ class DummyAllocator : public Allocator { bool IsAllocThreadSafe() const override { return true; } protected: - Allocation *AllocateImpl(size_t size) override { + pten::Allocation *AllocateImpl(size_t size) override { PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted( "Here is a test exception, always BadAlloc.")); } - void FreeImpl(Allocation *) override {} + void FreeImpl(pten::Allocation *) override {} }; TEST(RetryAllocator, RetryAllocatorLastAllocFailure) { diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc index a4f766f1d1abc..05c6a7adaff8b 100644 --- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc @@ -19,7 +19,7 @@ namespace memory { namespace allocation { StreamSafeCUDAAllocation::StreamSafeCUDAAllocation( - AllocationPtr underlying_allocation, gpuStream_t owning_stream) + DecoratedAllocationPtr underlying_allocation, gpuStream_t owning_stream) : Allocation(underlying_allocation->ptr(), underlying_allocation->base_ptr(), underlying_allocation->size(), underlying_allocation->place()), @@ -116,7 +116,7 @@ StreamSafeCUDAAllocator::~StreamSafeCUDAAllocator() { bool StreamSafeCUDAAllocator::IsAllocThreadSafe() const { return true; } -Allocation* StreamSafeCUDAAllocator::AllocateImpl(size_t size) { +pten::Allocation* StreamSafeCUDAAllocator::AllocateImpl(size_t size) { ProcessUnfreedAllocations(); VLOG(8) << "Try allocate " << size << " bytes"; AllocationPtr underlying_allocation; @@ -136,13 +136,14 @@ Allocation* StreamSafeCUDAAllocator::AllocateImpl(size_t size) { throw; } StreamSafeCUDAAllocation* allocation = new StreamSafeCUDAAllocation( - std::move(underlying_allocation), default_stream_); + static_unique_ptr_cast(std::move(underlying_allocation)), + default_stream_); VLOG(8) << "Allocate " << allocation->size() << " bytes at address " << allocation->ptr(); return allocation; } -void StreamSafeCUDAAllocator::FreeImpl(Allocation* allocation) { +void StreamSafeCUDAAllocator::FreeImpl(pten::Allocation* allocation) { StreamSafeCUDAAllocation* 
stream_safe_cuda_allocation = dynamic_cast(allocation); PADDLE_ENFORCE_NOT_NULL(stream_safe_cuda_allocation, diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h index d84994f58a9c4..f54cdc749611a 100644 --- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h @@ -34,7 +34,7 @@ namespace allocation { class StreamSafeCUDAAllocation : public Allocation { public: - StreamSafeCUDAAllocation(AllocationPtr underlying_allocation, + StreamSafeCUDAAllocation(DecoratedAllocationPtr underlying_allocation, gpuStream_t owning_stream); void RecordStream(const gpuStream_t &stream); bool CanBeFreed(); @@ -42,7 +42,7 @@ class StreamSafeCUDAAllocation : public Allocation { const gpuStream_t &GetOwningStream() const; private: - AllocationPtr underlying_allocation_; + DecoratedAllocationPtr underlying_allocation_; std::map outstanding_event_map_; gpuStream_t owning_stream_; SpinLock outstanding_event_map_lock_; @@ -57,8 +57,8 @@ class StreamSafeCUDAAllocator : public Allocator { bool IsAllocThreadSafe() const override; protected: - Allocation *AllocateImpl(size_t size) override; - void FreeImpl(Allocation *allocation) override; + pten::Allocation *AllocateImpl(size_t size) override; + void FreeImpl(pten::Allocation *allocation) override; uint64_t ReleaseImpl(const platform::Place &place) override; private: diff --git a/paddle/fluid/memory/allocation/test_aligned_allocator.cc b/paddle/fluid/memory/allocation/test_aligned_allocator.cc index 3eb1f140edd84..987c7ea772d23 100644 --- a/paddle/fluid/memory/allocation/test_aligned_allocator.cc +++ b/paddle/fluid/memory/allocation/test_aligned_allocator.cc @@ -32,12 +32,12 @@ struct StubAllocator : public Allocator { size_t AllocNum() const { return alloc_num_; } protected: - Allocation *AllocateImpl(size_t size) override { + pten::Allocation *AllocateImpl(size_t size) override { ++alloc_num_; return new Allocation(new uint8_t[size], size, platform::CPUPlace()); } - void FreeImpl(Allocation *allocation) override { + void FreeImpl(pten::Allocation *allocation) override { delete[] static_cast(allocation->ptr()); delete allocation; --alloc_num_; diff --git a/paddle/fluid/memory/allocation/thread_local_allocator.h b/paddle/fluid/memory/allocation/thread_local_allocator.h index c55f579981b00..9c9306517021a 100644 --- a/paddle/fluid/memory/allocation/thread_local_allocator.h +++ b/paddle/fluid/memory/allocation/thread_local_allocator.h @@ -83,11 +83,11 @@ class ThreadLocalCUDAAllocator : public Allocator { bool IsAllocThreadSafe() const override { return true; } protected: - Allocation* AllocateImpl(size_t size) override { + pten::Allocation* AllocateImpl(size_t size) override { return ThreadLocalCUDAAllocatorPool::Instance().Get(gpu_id_)->AllocateImpl( size); } - void FreeImpl(Allocation* allocation) override { + void FreeImpl(pten::Allocation* allocation) override { auto* tl_allocation = static_cast(allocation); auto allocator_impl = tl_allocation->GetAllocator(); allocator_impl->FreeImpl(tl_allocation); diff --git a/paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.cc b/paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.cc index 5c7e8e2d933f3..face27debe9ff 100644 --- a/paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.cc @@ -35,7 +35,8 @@ 
VirtualMemoryAutoGrowthBestFitAllocator:: alignment_(alignment), place_(place) {} -Allocation *VirtualMemoryAutoGrowthBestFitAllocator::AllocateImpl(size_t size) { +pten::Allocation *VirtualMemoryAutoGrowthBestFitAllocator::AllocateImpl( + size_t size) { std::lock_guard guard(spinlock_); size = AlignedSize(size, alignment_); auto result = AllocFromFreeBlocks(size); @@ -48,7 +49,8 @@ Allocation *VirtualMemoryAutoGrowthBestFitAllocator::AllocateImpl(size_t size) { return result; } -void VirtualMemoryAutoGrowthBestFitAllocator::FreeImpl(Allocation *allocation) { +void VirtualMemoryAutoGrowthBestFitAllocator::FreeImpl( + pten::Allocation *allocation) { std::lock_guard guard(spinlock_); auto block_it = static_cast(allocation)->block_it_; TryMergeBlock2Blocks(block_it); @@ -225,7 +227,7 @@ void VirtualMemoryAutoGrowthBestFitAllocator::ExtendAndMerge(size_t size) { } } -Allocation *VirtualMemoryAutoGrowthBestFitAllocator::AllocFromFreeBlocks( +pten::Allocation *VirtualMemoryAutoGrowthBestFitAllocator::AllocFromFreeBlocks( size_t size) { auto iter = free_blocks_.lower_bound(std::make_pair(size, nullptr)); if (iter != free_blocks_.end()) { diff --git a/paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.h b/paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.h index 5171e5b3cd1bf..10bf0bbf49d5a 100644 --- a/paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.h +++ b/paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.h @@ -60,12 +60,12 @@ class VirtualMemoryAutoGrowthBestFitAllocator : public Allocator { bool IsAllocThreadSafe() const override { return true; } protected: - Allocation *AllocateImpl(size_t size) override; + pten::Allocation *AllocateImpl(size_t size) override; - void FreeImpl(Allocation *allocation) override; + void FreeImpl(pten::Allocation *allocation) override; private: - Allocation *AllocFromFreeBlocks(size_t size); + pten::Allocation *AllocFromFreeBlocks(size_t size); void ExtendAndMerge(size_t size); void TryMergeBlock2Blocks(std::list::iterator iter); diff --git a/paddle/fluid/memory/malloc.h b/paddle/fluid/memory/malloc.h index 7069fb46203d6..8830c46a17798 100644 --- a/paddle/fluid/memory/malloc.h +++ b/paddle/fluid/memory/malloc.h @@ -28,7 +28,7 @@ class DeviceContext; namespace memory { -using allocation::Allocation; +using pten::Allocation; using allocation::Allocator; using allocation::AllocationPtr; diff --git a/paddle/fluid/operators/math/concat_and_split.cu b/paddle/fluid/operators/math/concat_and_split.cu index bc2d496a3e76a..6892f7ce4e503 100644 --- a/paddle/fluid/operators/math/concat_and_split.cu +++ b/paddle/fluid/operators/math/concat_and_split.cu @@ -336,9 +336,8 @@ class ConcatFunctor { auto* data_alloc_released = data_alloc.release(); auto* col_alloc_released = col_alloc.release(); context.AddStreamCallback([data_alloc_released, col_alloc_released] { - memory::allocation::AllocationDeleter deleter; - deleter(data_alloc_released); - deleter(col_alloc_released); + memory::allocation::Allocator::AllocationDeleter(data_alloc_released); + memory::allocation::Allocator::AllocationDeleter(col_alloc_released); }); #endif } @@ -466,9 +465,8 @@ class SplitFunctor { auto* data_alloc_released = data_alloc.release(); auto* cols_alloc_released = cols_alloc.release(); context.AddStreamCallback([data_alloc_released, cols_alloc_released] { - memory::allocation::AllocationDeleter deleter; - deleter(data_alloc_released); - deleter(cols_alloc_released); + 
memory::allocation::Allocator::AllocationDeleter(data_alloc_released); + memory::allocation::Allocator::AllocationDeleter(cols_alloc_released); }); #endif } diff --git a/paddle/fluid/platform/device/mlu/device_context_allocator.h b/paddle/fluid/platform/device/mlu/device_context_allocator.h index 9deab92af5cd6..408016c0f0d99 100644 --- a/paddle/fluid/platform/device/mlu/device_context_allocator.h +++ b/paddle/fluid/platform/device/mlu/device_context_allocator.h @@ -55,7 +55,7 @@ class MLUDeviceContextAllocation : public Allocation { << p_allocation; dev_ctx_->AddStreamCallback([p_allocation] { VLOG(4) << "Delete MLUDeviceContextAllocation at " << p_allocation; - AllocationDeleter()(p_allocation); + Allocator::AllocationDeleter(p_allocation); }); } @@ -91,7 +91,7 @@ class MLUDeviceContextAllocator : public Allocator { } protected: - Allocation *AllocateImpl(size_t size) override { + pten::Allocation *AllocateImpl(size_t size) override { PADDLE_ENFORCE_NOT_NULL( default_stream_, platform::errors::PreconditionNotMet( @@ -105,7 +105,7 @@ class MLUDeviceContextAllocator : public Allocator { return allocation; } - void FreeImpl(Allocation *allocation) override { delete allocation; } + void FreeImpl(pten::Allocation *allocation) override { delete allocation; } private: platform::MLUPlace place_; diff --git a/paddle/fluid/platform/device/npu/npu_op_runner.h b/paddle/fluid/platform/device/npu/npu_op_runner.h index e83057e682fef..c049da3b33566 100644 --- a/paddle/fluid/platform/device/npu/npu_op_runner.h +++ b/paddle/fluid/platform/device/npu/npu_op_runner.h @@ -158,8 +158,7 @@ void FillNpuTensorWithConstant(Tensor *tensor, T val) { paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(npu_pinned_place) .get()); - paddle::memory::allocation::Allocation *allocation = - npu_pinned_tensor.Holder().get(); + pten::Allocation *allocation = npu_pinned_tensor.Holder().get(); npu_pinned_allocator->RecordEvent(allocation, GetCurrentNPUStream()); } else { diff --git a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc index 3f8923440be50..659df6b9b44de 100644 --- a/paddle/fluid/pybind/eager_functions.cc +++ b/paddle/fluid/pybind/eager_functions.cc @@ -53,7 +53,7 @@ size_t PyArray_Size_(PyObject* numpy_data) { return res; } -class EagerNumpyAllocation : public paddle::memory::allocation::Allocation { +class EagerNumpyAllocation : public pten::Allocation { public: explicit EagerNumpyAllocation(PyObject* numpy_data, pten::DataType dtype) : Allocation( diff --git a/paddle/pten/api/lib/utils/CMakeLists.txt b/paddle/pten/api/lib/utils/CMakeLists.txt index 4a44ad7758b56..a4db8c4b193b6 100644 --- a/paddle/pten/api/lib/utils/CMakeLists.txt +++ b/paddle/pten/api/lib/utils/CMakeLists.txt @@ -1,2 +1,2 @@ -cc_library(pten_api_utils SRCS allocator.cc storage.cc tensor_utils.cc DEPS +cc_library(pten_api_utils SRCS storage.cc tensor_utils.cc DEPS tensor_base convert_utils dense_tensor lod_tensor selected_rows place var_type_traits) diff --git a/paddle/pten/api/lib/utils/allocator.cc b/paddle/pten/api/lib/utils/allocator.cc deleted file mode 100644 index e80152431e712..0000000000000 --- a/paddle/pten/api/lib/utils/allocator.cc +++ /dev/null @@ -1,23 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/pten/api/lib/utils/allocator.h" - -namespace paddle { -namespace experimental { - -memory::Allocator::AllocationDeleter DefaultAllocator::deleter_; - -} // namespace experimental -} // namespace paddle diff --git a/paddle/pten/api/lib/utils/allocator.h b/paddle/pten/api/lib/utils/allocator.h index 4f5a810e400ce..a8c05b7651689 100644 --- a/paddle/pten/api/lib/utils/allocator.h +++ b/paddle/pten/api/lib/utils/allocator.h @@ -22,14 +22,15 @@ limitations under the License. */ namespace paddle { namespace experimental { -class DefaultAllocator : public pten::Allocator { +class DefaultAllocator : public pten::deprecated::Allocator { public: - using Allocation = pten::Allocation; + using Allocation = pten::deprecated::Allocation; explicit DefaultAllocator(const paddle::platform::Place& place) : place_(place) {} static void Delete(Allocation* allocation) { - deleter_(allocation->CastContextWithoutCheck()); + paddle::memory::allocation::Allocator::AllocationDeleter( + allocation->CastContextWithoutCheck()); } Allocation Allocate(size_t bytes_size) override { @@ -42,7 +43,6 @@ class DefaultAllocator : public pten::Allocator { private: paddle::platform::Place place_; - static paddle::memory::Allocator::AllocationDeleter deleter_; }; } // namespace experimental diff --git a/paddle/pten/api/lib/utils/storage.cc b/paddle/pten/api/lib/utils/storage.cc index 9ee1b9e5b7f92..6116a709d5065 100644 --- a/paddle/pten/api/lib/utils/storage.cc +++ b/paddle/pten/api/lib/utils/storage.cc @@ -20,14 +20,13 @@ namespace experimental { ExternalStorage::ExternalStorage(void* ptr, size_t size, const paddle::platform::Place& place) - : pten::Storage( - std::make_shared(ptr, size, place)), + : pten::Storage(std::make_shared(ptr, size, place)), size_(size) {} ExternalStorage::ExternalStorage(const pten::intrusive_ptr& root, size_t delta, size_t size) - : Storage(std::make_shared( + : Storage(std::make_shared( static_cast(root->data()) + delta, size, root->place())), size_(size) { PADDLE_ENFORCE_LE(static_cast(delta + size), diff --git a/paddle/pten/api/lib/utils/tensor_utils.cc b/paddle/pten/api/lib/utils/tensor_utils.cc index 69a1fc274a28d..0b6cb8d95cc1a 100644 --- a/paddle/pten/api/lib/utils/tensor_utils.cc +++ b/paddle/pten/api/lib/utils/tensor_utils.cc @@ -307,7 +307,7 @@ void MovesStorage(pten::DenseTensor* src, paddle::framework::Tensor* dst) { dst->Resize(src->dims()); dst->set_type(pten::TransToProtoVarType(src->dtype())); auto storage = src->release(); - std::shared_ptr holder( + std::shared_ptr holder( new TensorStorage(std::move(storage))); dst->ResetHolderWithType(holder, pten::TransToProtoVarType(src->dtype())); dst->set_offset(src->meta().offset); diff --git a/paddle/pten/core/allocator.h b/paddle/pten/core/allocator.h index 74455be136834..2647490c9f58b 100644 --- a/paddle/pten/core/allocator.h +++ b/paddle/pten/core/allocator.h @@ -16,8 +16,10 @@ limitations under the License. 
*/ #include #include "paddle/fluid/platform/place.h" +#include "paddle/pten/core/candidate/allocator.h" namespace pten { +namespace deprecated { /// \brief Encapsulates strategies for access/addressing, allocation/ /// deallocation and construction/destruction of objects. @@ -147,4 +149,5 @@ inline Allocation Allocate(const std::shared_ptr& a, size_t n) { return a->Allocate(n); } +} // namespace deprecated } // namespace pten
diff --git a/paddle/pten/core/candidate/allocator.h b/paddle/pten/core/candidate/allocator.h new file mode 100644 index 0000000000000..75d42c4fd15cb --- /dev/null +++ b/paddle/pten/core/candidate/allocator.h
@@ -0,0 +1,107 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */
+ +#pragma once + +#include +#include +#include "paddle/fluid/platform/place.h" + +namespace pten {
+ +/// \brief Fancy pointer with deleter. The use of this data type +/// is to be compatible with allocators from different frameworks +/// without significant performance loss. This class does not +/// support being inherited. +class Allocation { + public: + using Place = paddle::platform::Place; + using DeleterFnPtr = void (*)(Allocation*); + + Allocation() = default; + + // Don't own resources, only provide access. + Allocation(void* data, size_t size, const Place& place) + : ptr_(data), size_(size), place_(place) {} + + // Own resources. + Allocation(void* data, size_t size, DeleterFnPtr deleter, const Place& place) + : ptr_(data), size_(size), deleter_(deleter), place_(place) {} + + Allocation(Allocation&& other) noexcept { swap(*this, other); } + Allocation& operator=(Allocation&& other) noexcept { + // Exchange them explicitly so that moving is not equivalent + // to copying. + swap(*this, other); + return *this; + } + + virtual ~Allocation() { + if (deleter_) { + deleter_(this); + } + }
+ + // Returns the holding pointer. + // NOTE: For performance consideration, it is better not to make this method + // a virtual method. If we want to implement a `defragmentation` later, + // we might need to make the `ptr_` field a protected field, and add a virtual + // method like `defragmentation` to change `ptr_`. + void* ptr() const noexcept { return ptr_; }
+ + // Returns the size of this memory buffer, i.e., ptr() + size() - 1 is the + // last valid element. + // + // NOTE: Some allocators might allocate more memory than requested. The size + // could be larger than the requested size. For example, + // the AlignedAllocator will always allocate memory as size + kAlignment. + // The raw pointer might not be aligned, so an offset might be added to + // the raw pointer. The size of this allocation will be + // `size + kAlignment - offset`.
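// ---------------------------------------------------------------------------
// Usage sketch (a minimal illustration, assuming std::malloc/std::free and
// paddle::platform::CPUPlace; `FreeHostBuffer` is a hypothetical deleter
// matching DeleterFnPtr): the two value constructors above cover both
// non-owning views and owning buffers.
//
//   static void FreeHostBuffer(Allocation* a) { std::free(a->ptr()); }
//
//   void* buf = std::malloc(256);
//   Allocation owned(buf, 256, &FreeHostBuffer,
//                    paddle::platform::CPUPlace());  // buf freed on destruction
//   Allocation view(owned.ptr(), owned.size(),
//                   paddle::platform::CPUPlace());   // non-owning access only
// ---------------------------------------------------------------------------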
+ size_t size() const noexcept { return size_; } + + void* operator->() const noexcept { return ptr_; } + operator bool() const noexcept { return ptr_; } + const Place& place() const noexcept { return place_; } + DeleterFnPtr deleter() const noexcept { return deleter_; } + + protected: + friend void swap(Allocation& a, Allocation& b) noexcept; + void* ptr_{nullptr}; + size_t size_{}; + DeleterFnPtr deleter_{nullptr}; + // TODO(Shixiaowei02): Enum needs to be used instead to reduce + // the construction overhead by more than 50%. + Place place_; +}; + +inline void swap(Allocation& a, Allocation& b) noexcept { + ::std::swap(a.ptr_, b.ptr_); + ::std::swap(a.deleter_, b.deleter_); + ::std::swap(a.place_, b.place_); + ::std::swap(a.size_, b.size_); +} + +class Allocator { + public: + using DeleterType = std::function; + using AllocationPtr = std::unique_ptr; + + virtual ~Allocator() = default; + virtual AllocationPtr Allocate(size_t bytes_size) = 0; + + virtual bool IsAllocThreadSafe() const { return false; } +}; + +} // namespace pten diff --git a/paddle/pten/core/dense_tensor.h b/paddle/pten/core/dense_tensor.h index 1502accd197be..1802a2461158f 100644 --- a/paddle/pten/core/dense_tensor.h +++ b/paddle/pten/core/dense_tensor.h @@ -60,6 +60,8 @@ class TensorInplaceVersion { class DenseTensor : public TensorBase, public TypeInfoTraits { public: + using Allocator = deprecated::Allocator; + /// \brief Construct a dense tensor and allocate space. /// \param a The allocator used to allocate space. /// \param meta The meta data of dense tensor. diff --git a/paddle/pten/core/storage.h b/paddle/pten/core/storage.h index fc56935eeaf19..cf18dd913093a 100644 --- a/paddle/pten/core/storage.h +++ b/paddle/pten/core/storage.h @@ -91,6 +91,7 @@ class Storage : public intrusive_ref_counter { class TensorStorage : public Storage { public: using Place = paddle::platform::Place; + using Allocator = deprecated::Allocator; explicit TensorStorage(const std::shared_ptr& a) : alloc_(a) {} diff --git a/paddle/pten/tests/core/allocator.h b/paddle/pten/tests/core/allocator.h index 094c0e8437d98..c2c74e1aacf1f 100644 --- a/paddle/pten/tests/core/allocator.h +++ b/paddle/pten/tests/core/allocator.h @@ -21,7 +21,7 @@ limitations under the License. */ namespace pten { namespace tests { -class HostAllocatorSample : public pten::RawAllocator { +class HostAllocatorSample : public pten::deprecated::RawAllocator { public: using Place = paddle::platform::Place; void* Allocate(size_t bytes_size) override { @@ -36,8 +36,9 @@ class HostAllocatorSample : public pten::RawAllocator { Place place_{paddle::platform::CPUPlace()}; }; -class FancyAllocator : public pten::Allocator { +class FancyAllocator : public pten::deprecated::Allocator { public: + using Allocation = pten::deprecated::Allocation; static void Delete(Allocation* allocation) { ::operator delete(allocation->ptr()); } @@ -55,7 +56,7 @@ class FancyAllocator : public pten::Allocator { template struct CustomAllocator { using value_type = T; - using Allocator = pten::RawAllocator; + using Allocator = pten::deprecated::RawAllocator; explicit CustomAllocator(const std::shared_ptr& a) noexcept : alloc_(a) {} diff --git a/paddle/pten/tests/core/test_allocator.cc b/paddle/pten/tests/core/test_allocator.cc index c509d8bd20a01..94ba9a1e1b9a2 100644 --- a/paddle/pten/tests/core/test_allocator.cc +++ b/paddle/pten/tests/core/test_allocator.cc @@ -24,6 +24,10 @@ limitations under the License. 
*/ namespace pten { namespace tests { +using RawAllocator = pten::deprecated::RawAllocator; +using Allocator = pten::deprecated::Allocator; +using Allocation = pten::deprecated::Allocation; + template bool host_allocator_test(size_t vector_size) { std::vector src(vector_size); diff --git a/tools/check_file_diff_approvals.sh b/tools/check_file_diff_approvals.sh index e0ae600819873..caacecf446a82 100644 --- a/tools/check_file_diff_approvals.sh +++ b/tools/check_file_diff_approvals.sh @@ -226,7 +226,7 @@ if [ "${HAS_MODIFIED_DEMO_CMAKE}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then HAS_MODIFIED_ALLOCATION=`git diff --name-only upstream/$BRANCH | grep "paddle/fluid/memory/allocation" || true` if [ "${HAS_MODIFIED_ALLOCATION}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then echo_line="You must be approved by zhiqiu and Shixiaowei02 for paddle/fluid/memory/allocation.\nIt is being modularized and refactored. Thanks!\n" - check_approval 2 6888866 39303645 + check_approval 1 6888866 39303645 fi HAS_MODIFIED_TENSOR=`git diff --name-only upstream/$BRANCH | grep "paddle/fluid/framework/tensor" || true` @@ -241,23 +241,6 @@ if [ "${HAS_MODIFIED_TENSOR}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then check_approval 1 22561442 22334008 fi -ALLOCSHARED_FILE_CHANGED=`git diff --name-only --diff-filter=AM upstream/$BRANCH |grep -E "*\.(h|cc)" || true` -if [ "${ALLOCSHARED_FILE_CHANGED}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then - ERROR_LINES="" - for TEST_FILE in ${ALLOCSHARED_FILE_CHANGED}; - do - HAS_SKIP_CHECK_ALLOC_CI=`git diff -U0 upstream/$BRANCH ${PADDLE_ROOT}/${TEST_FILE} |grep "AllocShared" || true` - if [ "${HAS_SKIP_CHECK_ALLOC_CI}" != "" ]; then - ERROR_LINES="${ERROR_LINES}\n${TEST_FILE}\n${HAS_SKIP_CHECK_ALLOC_CI}\n" - fi - done - if [ "${ERROR_LINES}" != "" ]; then - ERROR_LINES=${ERROR_LINES//+/'\n+\t'} - echo_line="memory::AllocShared is not recommended, because it is being modularized and refactored. Please use memory::Alloc here. Otherwise, please request zhiqiu and Shixiaowei02 review and approve.\n" - check_approval 2 6888866 39303645 - fi -fi - ALL_PADDLE_ENFORCE=`git diff -U0 upstream/$BRANCH |grep "^+" |grep -zoE "PADDLE_ENFORCE\(.[^,\);]+.[^;]*\);\s" || true` if [ "${ALL_PADDLE_ENFORCE}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then echo_line="PADDLE_ENFORCE is not recommended. 
Please use PADDLE_ENFORCE_EQ/NE/GT/GE/LT/LE or PADDLE_ENFORCE_NOT_NULL or PADDLE_ENFORCE_GPU_SUCCESS instead, see [ https://github.com/PaddlePaddle/Paddle/wiki/PADDLE_ENFORCE-Rewriting-Specification ] for details.\nYou must have one RD (chenwhql (Recommend) , luotao1 (Recommend) or lanxianghit) approval for the usage (either add or delete) of PADDLE_ENFORCE.\n${ALL_PADDLE_ENFORCE}\n" From 5e5157812d0284f265c4d927b85d66b5bfb9c6d2 Mon Sep 17 00:00:00 2001 From: Weilong Wu Date: Thu, 13 Jan 2022 11:06:09 +0800 Subject: [PATCH 03/24] Support test_imperative using_non_zero_gpu with _test_eager_guard() (#38881) * Support test_imperative using_non_zero_gpu and Add a TODO comment * Change GPU number to 0 * Modify the cuda device selection method --- .../unittests/test_imperative_numpy_bridge.py | 1 + .../test_imperative_using_non_zero_gpu.py | 17 ++++++++++++++--- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_numpy_bridge.py b/python/paddle/fluid/tests/unittests/test_imperative_numpy_bridge.py index 4f3089baffdd3..7b8d31ff030e5 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_numpy_bridge.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_numpy_bridge.py @@ -42,6 +42,7 @@ def func_tensor_from_numpy(self): self.assertEqual(data_np[0][0], -1) if _in_eager_mode(): # eager_mode, var2 is EagerTensor, is not subscriptable + # TODO(wuweilong): to support slice in eager mode later self.assertNotEqual(var2.numpy()[0][0], -1) else: self.assertNotEqual(var2[0][0].numpy()[0], -1) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_using_non_zero_gpu.py b/python/paddle/fluid/tests/unittests/test_imperative_using_non_zero_gpu.py index f2dfaef397797..46a89efcec491 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_using_non_zero_gpu.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_using_non_zero_gpu.py @@ -12,10 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import paddle import paddle.fluid as fluid import unittest -from paddle.fluid.dygraph import to_variable, Embedding, guard +from paddle.fluid.dygraph import to_variable, guard import numpy as np +from paddle.fluid.framework import _test_eager_guard class TestImperativeUsingNonZeroGpu(unittest.TestCase): @@ -24,12 +26,21 @@ def run_main(self, np_arr, place): var = to_variable(np_arr) self.assertTrue(np.array_equal(np_arr, var.numpy())) - def test_non_zero_gpu(self): + def func_non_zero_gpu(self): if not fluid.is_compiled_with_cuda(): return np_arr = np.random.random([11, 13]).astype('float32') - self.run_main(np_arr, fluid.CUDAPlace(0)) + if paddle.device.cuda.device_count() > 1: + # should use non zero gpu if there are more than 1 gpu + self.run_main(np_arr, fluid.CUDAPlace(1)) + else: + self.run_main(np_arr, fluid.CUDAPlace(0)) + + def test_non_zero_gpu(self): + with _test_eager_guard(): + self.func_non_zero_gpu() + self.func_non_zero_gpu() if __name__ == '__main__': From 281644cd0734d99151b08f8e221c2fd58a326249 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Thu, 13 Jan 2022 11:15:49 +0800 Subject: [PATCH 04/24] Fix mkldnn invalid infershape impl (#38837) * fix mkldnn invalid infershape * add unittest for mkldnn in new executor * add import os --- .../fluid/eager/legacy/infer_shape_context.h | 19 ++++++++++++++----- .../fluid/eager/legacy/prepared_operator.cc | 2 +- .../new_executor/new_executor_defs.cc | 11 +++++++++++ .../new_executor/new_executor_defs.h | 2 ++ paddle/fluid/framework/op_desc.cc | 4 ++++ paddle/fluid/framework/operator.cc | 15 ++++++++++++--- paddle/fluid/framework/operator.h | 7 ++----- paddle/fluid/framework/shape_inference.h | 2 ++ paddle/fluid/imperative/infer_shape_context.h | 19 +++++++++++++------ paddle/fluid/imperative/prepared_operator.cc | 8 ++++---- paddle/fluid/operators/batch_norm_op.cc | 6 +++--- paddle/fluid/operators/conv_op.cc | 2 +- paddle/fluid/operators/conv_transpose_op.cc | 4 ++-- paddle/fluid/operators/inplace_abn_op.cc | 8 ++++---- paddle/fluid/operators/pool_op.cc | 2 +- .../unittests/mkldnn/test_conv2d_mkldnn_op.py | 10 ++++++++++ 16 files changed, 86 insertions(+), 35 deletions(-) diff --git a/paddle/fluid/eager/legacy/infer_shape_context.h b/paddle/fluid/eager/legacy/infer_shape_context.h index 7a05f6a9b3581..a1032fd404f85 100644 --- a/paddle/fluid/eager/legacy/infer_shape_context.h +++ b/paddle/fluid/eager/legacy/infer_shape_context.h @@ -31,15 +31,18 @@ class EagerInferShapeContext : public paddle::framework::InferShapeContext { using DDim = paddle::framework::DDim; public: - EagerInferShapeContext(const NameTensorMap* in, const NameTensorMap* out, - const paddle::framework::AttributeMap* attr, - const paddle::framework::AttributeMap* default_attr, - const std::string op_type) + EagerInferShapeContext( + const NameTensorMap* in, const NameTensorMap* out, + const paddle::framework::AttributeMap* attr, + const paddle::framework::AttributeMap* default_attr, + const std::string op_type, + const paddle::framework::OpKernelType* op_kernel_type = nullptr) : tensor_in_(in), tensor_out_(out), attrs_(attr), default_attrs_(default_attr), - op_type_(op_type) {} + op_type_(op_type), + op_kernel_type_(op_kernel_type) {} bool HasInput(const std::string& name) const override { // has only one input @@ -214,6 +217,11 @@ class EagerInferShapeContext : public paddle::framework::InferShapeContext { bool IsRuntime() const override { return true; } + bool IsRunMKLDNNKernel() const override { + return (op_kernel_type_ && (op_kernel_type_->data_layout_ == + 
paddle::framework::DataLayout::kMKLDNN)); + } + // TODO(paddle-dev): Can this be template? std::vector GetInputVarPtrs( const std::string& name) const override { @@ -400,6 +408,7 @@ class EagerInferShapeContext : public paddle::framework::InferShapeContext { const paddle::framework::AttributeMap* attrs_; const paddle::framework::AttributeMap* default_attrs_; const std::string op_type_; + const paddle::framework::OpKernelType* op_kernel_type_; }; } // namespace legacy diff --git a/paddle/fluid/eager/legacy/prepared_operator.cc b/paddle/fluid/eager/legacy/prepared_operator.cc index 4e892b14a9c9c..fbf2d678740ab 100644 --- a/paddle/fluid/eager/legacy/prepared_operator.cc +++ b/paddle/fluid/eager/legacy/prepared_operator.cc @@ -173,7 +173,7 @@ static void PreparedOpRunImpl( paddle::framework::Scope scope; EagerInferShapeContext infer_shape_ctx(&ins, &outs, &attrs, &default_attrs, - op.Type()); + op.Type(), &kernel_type); op.Info().infer_shape_(&infer_shape_ctx); func(EagerExecutionContext(op, scope, *dev_ctx, ctx, ins, outs, attrs, diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.cc b/paddle/fluid/framework/new_executor/new_executor_defs.cc index 4b9404fd178fd..654746794da4e 100644 --- a/paddle/fluid/framework/new_executor/new_executor_defs.cc +++ b/paddle/fluid/framework/new_executor/new_executor_defs.cc @@ -307,6 +307,17 @@ void InterpretercoreInferShapeContext::SetLoDLevel(const std::string& out, bool InterpretercoreInferShapeContext::IsRuntime() const { return true; } +bool InterpretercoreInferShapeContext::IsRunMKLDNNKernel() const { + try { + auto& op_with_kernel = dynamic_cast(op_); + return ((op_with_kernel.kernel_type()) && + (op_with_kernel.kernel_type()->data_layout_ == + framework::DataLayout::kMKLDNN)); + } catch (std::bad_cast exp) { + return false; + } +} + // TODO(paddle-dev): Can this be template? std::vector InterpretercoreInferShapeContext::GetInputVarPtrs( const std::string& name) const { diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.h b/paddle/fluid/framework/new_executor/new_executor_defs.h index ca49e7f5670d6..5d63eb33d424b 100644 --- a/paddle/fluid/framework/new_executor/new_executor_defs.h +++ b/paddle/fluid/framework/new_executor/new_executor_defs.h @@ -84,6 +84,8 @@ class InterpretercoreInferShapeContext : public InferShapeContext { bool IsRuntime() const override; + bool IsRunMKLDNNKernel() const override; + // TODO(paddle-dev): Can this be template? 
std::vector GetInputVarPtrs( const std::string& name) const override; diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc index 4254ec236d473..7bceeb05bac59 100644 --- a/paddle/fluid/framework/op_desc.cc +++ b/paddle/fluid/framework/op_desc.cc @@ -240,6 +240,8 @@ class CompileTimeInferShapeContext : public InferShapeContext { bool IsRuntime() const override; + bool IsRunMKLDNNKernel() const override; + std::vector GetInputsVarType( const std::string &name) const override { return GetVarTypes(Inputs(name)); @@ -930,6 +932,8 @@ void CompileTimeInferShapeContext::SetRepeatedDims( bool CompileTimeInferShapeContext::IsRuntime() const { return false; } +bool CompileTimeInferShapeContext::IsRunMKLDNNKernel() const { return false; } + proto::VarType::Type CompileTimeInferShapeContext::GetVarType( const std::string &name) const { return block_.FindVarRecursive(name)->GetType(); diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index dc4d1365093aa..93349b8b88449 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -884,6 +884,17 @@ class RuntimeInferShapeContext : public InferShapeContext { bool IsRuntime() const override { return true; } + bool IsRunMKLDNNKernel() const override { + try { + auto& op_with_kernel = dynamic_cast(op_); + return ((op_with_kernel.kernel_type()) && + (op_with_kernel.kernel_type()->data_layout_ == + framework::DataLayout::kMKLDNN)); + } catch (std::bad_cast exp) { + return false; + } + } + // TODO(paddle-dev): Can this be template? std::vector GetInputVarPtrs( const std::string& name) const override { @@ -1178,9 +1189,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope, platform::RecordEvent record_event("infer_shape", platform::EventRole::kInnerOp); RuntimeInferShapeContext infer_shape_ctx(*this, *runtime_ctx); - // TODO(chenweihang): replace this after removing `this->IsMKLDNNType()` - // in some mkldnn infershape functions, such conv2d infershape - this->InferShape(&infer_shape_ctx); + this->Info().infer_shape_(&infer_shape_ctx); } if (FLAGS_enable_unused_var_check) { diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 09e4abc77f573..8e69f96dfb813 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -528,11 +528,6 @@ class OperatorWithKernel : public OperatorBase { return g_all_op_kernels; } - bool IsMKLDNNType() const { - return ((this->kernel_type_) && (this->kernel_type_->data_layout_ == - framework::DataLayout::kMKLDNN)); - } - bool SupportGPU() const override { auto& op_kernels = OperatorWithKernel::AllOpKernels().at(type_); return std::any_of(op_kernels.begin(), op_kernels.end(), @@ -609,6 +604,8 @@ class OperatorWithKernel : public OperatorBase { return pt_kernel_context_.get(); } + const OpKernelType* kernel_type() const { return kernel_type_.get(); } + private: void RunImpl(const Scope& scope, const platform::Place& place) const final; void RunImpl(const Scope& scope, const platform::Place& place, diff --git a/paddle/fluid/framework/shape_inference.h b/paddle/fluid/framework/shape_inference.h index 10b0fa6afd78a..791600b39c3d9 100644 --- a/paddle/fluid/framework/shape_inference.h +++ b/paddle/fluid/framework/shape_inference.h @@ -102,6 +102,8 @@ class InferShapeContext { virtual bool IsRuntime() const = 0; + virtual bool IsRunMKLDNNKernel() const = 0; + virtual std::vector GetInputVarPtrs( const std::string &name) const = 0; virtual std::vector GetOutputVarPtrs( diff --git 
a/paddle/fluid/imperative/infer_shape_context.h b/paddle/fluid/imperative/infer_shape_context.h index 167d5682cbfdb..a16ad1688fbac 100644 --- a/paddle/fluid/imperative/infer_shape_context.h +++ b/paddle/fluid/imperative/infer_shape_context.h @@ -32,16 +32,17 @@ class DygraphInferShapeContext : public framework::InferShapeContext { using DDim = framework::DDim; public: - DygraphInferShapeContext(const NameVarMap* in, - const NameVarMap* out, - const framework::AttributeMap* attr, - const framework::AttributeMap* default_attr, - const std::string op_type) + DygraphInferShapeContext( + const NameVarMap* in, const NameVarMap* out, + const framework::AttributeMap* attr, + const framework::AttributeMap* default_attr, const std::string op_type, + const framework::OpKernelType* op_kernel_type = nullptr) : var_base_map_in_(in), var_base_map_out_(out), attrs_(attr), default_attrs_(default_attr), - op_type_(op_type) {} + op_type_(op_type), + op_kernel_type_(op_kernel_type) {} bool HasInput(const std::string& name) const override { // has only one input @@ -214,6 +215,11 @@ class DygraphInferShapeContext : public framework::InferShapeContext { bool IsRuntime() const override { return true; } + bool IsRunMKLDNNKernel() const override { + return (op_kernel_type_ && + (op_kernel_type_->data_layout_ == framework::DataLayout::kMKLDNN)); + } + // TODO(paddle-dev): Can this be template? std::vector GetInputVarPtrs( const std::string& name) const override { @@ -399,6 +405,7 @@ class DygraphInferShapeContext : public framework::InferShapeContext { const framework::AttributeMap* attrs_; const framework::AttributeMap* default_attrs_; const std::string op_type_; + const framework::OpKernelType* op_kernel_type_; }; } // namespace imperative diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index 1d12ecf30ede5..46e974c8f43f3 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -514,8 +514,8 @@ static void PreparedOpRunImpl( // TODO(zjl): remove scope in dygraph framework::Scope scope; - DygraphInferShapeContext infer_shape_ctx(&ins, &outs, &attrs, - &default_attrs, op.Type()); + DygraphInferShapeContext infer_shape_ctx( + &ins, &outs, &attrs, &default_attrs, op.Type(), &kernel_type); op.Info().infer_shape_(&infer_shape_ctx); func(DygraphExecutionContext(op, scope, *dev_ctx, ctx, ins, outs, @@ -560,8 +560,8 @@ static void PreparedOpRunPtImpl( platform::DeviceContext* dev_ctx, const NameVarMap& ins, const NameVarMap& outs, const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs) { - DygraphInferShapeContext infer_shape_ctx(&ins, &outs, &attrs, - &default_attrs, op.Type()); + DygraphInferShapeContext infer_shape_ctx( + &ins, &outs, &attrs, &default_attrs, op.Type(), &kernel_type); op.Info().infer_shape_(&infer_shape_ctx); BuildDygraphPtenKernelContext(pt_kernel_signature, pt_kernel, ins, diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index bc5bd118dbec4..0a8e753c01dc0 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -93,7 +93,7 @@ void BatchNormOp::InferShape(framework::InferShapeContext *ctx) const { x_dims, x_dims.size())); const int64_t C = - ((this->IsMKLDNNType() == true) || (data_layout == DataLayout::kNCHW) + ((ctx->IsRunMKLDNNKernel() == true) || (data_layout == DataLayout::kNCHW) ? 
x_dims[1] : x_dims[x_dims.size() - 1]); @@ -508,7 +508,7 @@ void BatchNormGradOp::InferShape(framework::InferShapeContext *ctx) const { ctx->Attrs().Get("data_layout")); const int C = - ((this->IsMKLDNNType() == true) || (data_layout == DataLayout::kNCHW) + ((ctx->IsRunMKLDNNKernel() == true) || (data_layout == DataLayout::kNCHW) ? x_dims[1] : x_dims[x_dims.size() - 1]); @@ -911,7 +911,7 @@ void BatchNormDoubleGradOp::InferShape( const DataLayout data_layout = framework::StringToDataLayout( ctx->Attrs().Get("data_layout")); const int C = - ((this->IsMKLDNNType() == true) || (data_layout == DataLayout::kNCHW) + ((ctx->IsRunMKLDNNKernel() == true) || (data_layout == DataLayout::kNCHW) ? x_dims[1] : x_dims[x_dims.size() - 1]); diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index 41f6f75200697..e500814232aae 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -57,7 +57,7 @@ std::vector ConvOp::ComputeOutputShape( // MKL-DNN Kernels are using NCHW order of dims description // so we ignore data_format consideration for MKL-DNN kernel - const bool channel_last = (this->IsMKLDNNType() == false) && + const bool channel_last = (ctx->IsRunMKLDNNKernel() == false) && (data_format == "NHWC" || data_format == "NDHWC"); PADDLE_ENFORCE_EQ( diff --git a/paddle/fluid/operators/conv_transpose_op.cc b/paddle/fluid/operators/conv_transpose_op.cc index d60786f60e9cc..12f537e2f7980 100644 --- a/paddle/fluid/operators/conv_transpose_op.cc +++ b/paddle/fluid/operators/conv_transpose_op.cc @@ -49,8 +49,8 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const { const std::string data_layout_str = ctx->Attrs().Get("data_format"); const DataLayout data_layout = - this->IsMKLDNNType() ? DataLayout::kNCHW - : framework::StringToDataLayout(data_layout_str); + ctx->IsRunMKLDNNKernel() ? DataLayout::kNCHW + : framework::StringToDataLayout(data_layout_str); PADDLE_ENFORCE_EQ(in_dims.size() == 4 || in_dims.size() == 5, true, platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/inplace_abn_op.cc b/paddle/fluid/operators/inplace_abn_op.cc index 8234d63d681ff..7a112292c8fc5 100644 --- a/paddle/fluid/operators/inplace_abn_op.cc +++ b/paddle/fluid/operators/inplace_abn_op.cc @@ -100,10 +100,10 @@ class InplaceABNGradOp : public paddle::operators::BatchNormGradOp { const DataLayout data_layout = framework::StringToDataLayout( ctx->Attrs().Get("data_layout")); - const int C = - ((this->IsMKLDNNType() == true) || (data_layout == DataLayout::kNCHW) - ? y_dims[1] - : y_dims[y_dims.size() - 1]); + const int C = ((ctx->IsRunMKLDNNKernel() == true) || + (data_layout == DataLayout::kNCHW) + ? 
y_dims[1] + : y_dims[y_dims.size() - 1]); ctx->SetOutputDim(framework::GradVarName("X"), y_dims); // has_scale_grad == has_bias_grad, judge has_scale_grad is enough diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc index fa98e76e39338..b4ba80ae7ae2f 100644 --- a/paddle/fluid/operators/pool_op.cc +++ b/paddle/fluid/operators/pool_op.cc @@ -97,7 +97,7 @@ void PoolOp::InferShape(framework::InferShapeContext* ctx) const { // MKL-DNN Kernels are using NCHW order of dims description // so we ignore data_format consideration for MKL-DNN kernel - const bool channel_last = (this->IsMKLDNNType() == false) && + const bool channel_last = (ctx->IsRunMKLDNNKernel() == false) && (data_format == "NHWC" || data_format == "NDHWC"); // update paddings if "SAME" or global_pooling diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_mkldnn_op.py index 50d53864789f3..487a69807e2b0 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_mkldnn_op.py @@ -14,6 +14,7 @@ from __future__ import print_function +import os import unittest import numpy as np @@ -232,6 +233,15 @@ def init_group(self): self.groups = 3 +# TODO(chenweihang): To solve the coverage problem, add this unittest, +# remove this unittest after new executor set to default executor +class TestConv2dMKLDNNByNewExecutor(TestConv2DMKLDNNOp): + def test_check_output_by_new_executor(self): + os.environ['FLAGS_USE_STANDALONE_EXECUTOR'] = '1' + self.test_check_output() + del os.environ['FLAGS_USE_STANDALONE_EXECUTOR'] + + if __name__ == '__main__': from paddle import enable_static enable_static() From fc6eed5b2789d5cdb5c84bf2fb9e41db2bcfdc5d Mon Sep 17 00:00:00 2001 From: jakpiase Date: Thu, 13 Jan 2022 04:43:45 +0100 Subject: [PATCH 05/24] Added mul BF16/FP32 FWD/BWD oneDNN kernel (#38552) * base changes for mul reimplementation * empty commit * tmp save * full implementation of mul bf16/fp32 fwd bwd * CI fix * CI rerun * changed unity build cmake to avoid gpu issues * removed mul mkldnn from unity build * added skipping tests if not cpu_bf16 * CI fix * CI fix * CI fix --- .../operators/mkldnn/matmul_v2_mkldnn_op.cc | 109 +---------- .../fluid/operators/mkldnn/mul_mkldnn_op.cc | 176 +++++++++++++++++- paddle/fluid/operators/mul_op.cc | 36 ++++ paddle/fluid/operators/mul_op.h | 1 + paddle/fluid/operators/unity_build_rule.cmake | 1 - paddle/fluid/platform/mkldnn_reuse.h | 108 +++++++++++ .../contrib/mixed_precision/bf16/amp_lists.py | 2 +- .../fluid/tests/book/test_fit_a_line.py | 13 ++ .../mkldnn/test_mul_int8_mkldnn_op.py | 2 + .../unittests/mkldnn/test_mul_mkldnn_op.py | 159 ++++++++++++++++ 10 files changed, 490 insertions(+), 117 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/mkldnn/test_mul_mkldnn_op.py diff --git a/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc index a8d4b852ca3c2..d3c7c1759641b 100644 --- a/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc @@ -20,6 +20,7 @@ using dnnl::memory; using dnnl::primitive; using paddle::framework::DataLayout; using paddle::framework::ExecutionContext; +using paddle::platform::MatMulV2MKLDNNHandler; using paddle::platform::GetMKLDNNFormat; using paddle::platform::MKLDNNDeviceContext; using paddle::platform::MKLDNNGetDataType; @@ -107,114 +108,6 @@ std::vector 
GetInputStrides(const ExecutionContext& ctx, return strides; } -template -class MatMulV2MKLDNNHandler - : public paddle::platform::MKLDNNHandlerNoCachingT { - public: - MatMulV2MKLDNNHandler(const dnnl::engine engine, - paddle::platform::Place cpu_place, - const std::vector& x_org_dims, bool trans_x, - const std::vector& y_org_dims, bool trans_y, - bool is_output_fused, - const std::vector& x_strides_override, - const std::vector& y_strides_override) - : paddle::platform::MKLDNNHandlerNoCachingT(engine, - cpu_place) { - // M X K * K X N - std::vector x_dims(x_org_dims); - std::vector y_dims(y_org_dims); - - const int MB_idx = x_dims.size() - 3; - const int H_idx = x_dims.size() - 2; - const int W_idx = x_dims.size() - 1; - - if (trans_x) std::swap(x_dims[H_idx], x_dims[W_idx]); - if (trans_y) std::swap(y_dims[H_idx], y_dims[W_idx]); - - const memory::dim M = x_dims[H_idx]; - const memory::dim K = x_dims[W_idx]; - const memory::dim N = y_dims[W_idx]; - - std::vector x_strides(x_dims.size() - 3, 1); - std::vector y_strides(x_dims.size() - 3, 1); - std::vector out_strides(x_dims.size() - 3, 1); - std::vector out_ddims(x_dims.size() - 3, 1); - - x_strides.reserve(x_dims.size()); - y_strides.reserve(x_dims.size()); - out_strides.reserve(x_dims.size()); - - if (!x_strides_override.empty()) { - x_strides = x_strides_override; - } else { - if (!trans_x) { - x_strides.insert(x_strides.end(), {M * K, K, 1}); - } else { - x_strides.insert(x_strides.end(), {M * K, 1, M}); - } - } - - if (!y_strides_override.empty()) { - y_strides = y_strides_override; - } else { - if (!trans_y) { - y_strides.insert(y_strides.end(), {N * K, N, 1}); - } else { - y_strides.insert(y_strides.end(), {N * K, 1, K}); - } - } - - out_strides.insert(out_strides.end(), {M * N, N, 1}); - out_ddims.insert(out_ddims.end(), - {std::max(x_dims[MB_idx], y_dims[MB_idx]), M, N}); - - for (int i = x_dims.size() - 4; i >= 0; --i) { - out_ddims[i] = std::max(x_dims[i], y_dims[i]); - if (x_strides_override.empty()) { - x_strides[i] = x_dims[i + 1] * x_strides[i + 1]; - } - if (y_strides_override.empty()) { - y_strides[i] = y_dims[i + 1] * y_strides[i + 1]; - } - out_strides[i] = out_ddims[i + 1] * out_strides[i + 1]; - } - - if (is_output_fused) { - out_strides = FakeTransposeStrides(out_ddims); - } - - auto x_md = memory::desc(x_dims, MKLDNNGetDataType(), x_strides); - auto y_md = memory::desc(y_dims, MKLDNNGetDataType(), y_strides); - auto out_md = memory::desc(out_ddims, MKLDNNGetDataType(), out_strides); - - this->AcquireForwardPrimitiveDescriptor(x_md, y_md, out_md); - } - - std::vector FakeTransposeStrides( - const std::vector& matmul_out_dims) const { - // fuse matmul_v2 + transpose + reshape guarantees that output is 4D and - // transpose axis are: {0, 2, 1, 3} - std::vector transpose_axis = {0, 2, 1, 3}; - std::vector fake_strides(transpose_axis.size()); - int ndims = static_cast(transpose_axis.size()); - - int total_stride = 1; - - for (int i = ndims - 1; i >= 0; --i) { - fake_strides[transpose_axis[i]] = total_stride; - total_stride *= matmul_out_dims[transpose_axis[i]]; - } - - return fake_strides; - } - - std::shared_ptr AcquireWeightsMemory(const Tensor* input) { - const T* input_data = input->data(); - return this->AcquireMemoryFromPrimitive(this->fwd_pd_->weights_desc(), - to_void_cast(input_data)); - } -}; - bool IsOutputFused(const ExecutionContext& ctx) { auto& fused_reshape_Out = ctx.Attr>("fused_reshape_Out"); auto& fused_transpose_Out = ctx.Attr>("fused_transpose_Out"); diff --git 
a/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc index 0938024052271..49c896ef80fcc 100644 --- a/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc @@ -15,7 +15,7 @@ limitations under the License. */ #include #include "paddle/fluid/operators/mul_op.h" -#include "paddle/fluid/platform/mkldnn_helper.h" +#include "paddle/fluid/platform/mkldnn_reuse.h" namespace paddle { namespace framework { @@ -32,13 +32,17 @@ namespace operators { using framework::DataLayout; using framework::DDim; using framework::ExecutionContext; +using framework::LoDTensor; using framework::Tensor; + +using platform::MatMulV2MKLDNNHandler; +using platform::MKLDNNDeviceContext; +using platform::to_void_cast; + using dnnl::inner_product_forward; using dnnl::memory; using dnnl::prop_kind; using dnnl::stream; -using platform::MKLDNNDeviceContext; -using platform::to_void_cast; template class MulPrimitiveFactory { @@ -345,7 +349,7 @@ inner_product_forward GetMulPrimitive(const MKLDNNDeviceContext &dev_ctx, /* XT: input x data type, YT: input y data type */ template -class MulMKLDNNKernel : public framework::OpKernel { +class MulMKLDNNINT8Kernel : public framework::OpKernel { public: void Compute(const ExecutionContext &ctx) const override { PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true, @@ -371,17 +375,175 @@ class MulMKLDNNKernel : public framework::OpKernel { } }; +template +class MulMKLDNNKernel : public framework::OpKernel { + public: + void Compute(const ExecutionContext &ctx) const override { RunKernel(ctx); } + + protected: + void ExecuteMatMul(const ExecutionContext &ctx, + const MKLDNNDeviceContext &dev_ctx, + const dnnl::engine &onednn_engine, + const platform::Place &cpu_place, const Tensor *x, + const std::vector &x_dims, bool trans_x, + const Tensor *y, const std::vector &y_dims, + bool trans_y, Tensor *out) const { + static const std::vector vec_placeholder; + MatMulV2MKLDNNHandler handler(onednn_engine, ctx.GetPlace(), x_dims, + trans_x, y_dims, trans_y, false, + vec_placeholder, vec_placeholder); + + const auto src_memory_p = handler.AcquireSrcMemory(x); + const auto weights_memory_p = handler.AcquireWeightsMemory(y); + const auto dst_memory_p = handler.AcquireDstMemory(out); + + auto matmul_p = handler.AcquireForwardPrimitive(); + + std::unordered_map matmul_args = { + {DNNL_ARG_SRC, *src_memory_p}, + {DNNL_ARG_WEIGHTS, *weights_memory_p}, + {DNNL_ARG_DST, *dst_memory_p}}; + + auto &astream = MKLDNNDeviceContext::tls().get_stream(); + matmul_p->execute(astream, matmul_args); + astream.wait(); + + out->set_layout(framework::DataLayout::kMKLDNN); + // plain output formats are enforced inside handler + out->set_format(platform::MKLDNNFormatForSize( + out->dims().size(), dnnl::memory::format_tag::nchw)); + } + + private: + void RunKernel(const ExecutionContext &ctx) const { + const auto &dev_ctx = ctx.template device_context(); + const auto &onednn_engine = dev_ctx.GetEngine(); + + const auto *x = ctx.Input("X"); + const auto *y = ctx.Input("Y"); + auto *out = ctx.Output("Out"); + + int x_num_col_dims = ctx.Attr("x_num_col_dims"); + int y_num_col_dims = ctx.Attr("y_num_col_dims"); + + const Tensor x_matrix = x->dims().size() > 2 + ? framework::ReshapeToMatrix(*x, x_num_col_dims) + : *x; + const Tensor y_matrix = y->dims().size() > 2 + ? 
framework::ReshapeToMatrix(*y, y_num_col_dims) + : *y; + + // adding mb dim because MatMulV2 handler needs it + std::vector y_dims(3, 1); + std::vector x_dims(3, 1); + + y_dims[1] = y_matrix.dims()[0]; + y_dims[2] = y_matrix.dims()[1]; + + x_dims[1] = x_matrix.dims()[0]; + x_dims[2] = x_matrix.dims()[1]; + + ExecuteMatMul(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), &x_matrix, + x_dims, false, &y_matrix, y_dims, false, out); + } +}; + +template +class MulGradMKLDNNKernel : public MulMKLDNNKernel { + public: + void Compute(const ExecutionContext &ctx) const override { RunKernel(ctx); } + + private: + template + void RunKernel(const ExecutionContext &ctx) const { + const auto &dev_ctx = ctx.template device_context(); + const auto &onednn_engine = dev_ctx.GetEngine(); + + const auto *x = ctx.Input("X"); + const auto *y = ctx.Input("Y"); + const auto *dout = ctx.Input(framework::GradVarName("Out")); + + auto *dx = ctx.Output(framework::GradVarName("X")); + auto *dy = ctx.Output(framework::GradVarName("Y")); + + int x_num_col_dims = ctx.Attr("x_num_col_dims"); + int y_num_col_dims = ctx.Attr("y_num_col_dims"); + + const Tensor x_matrix = x->dims().size() > 2 + ? framework::ReshapeToMatrix(*x, x_num_col_dims) + : static_cast(*x); + const Tensor y_matrix = y->dims().size() > 2 + ? framework::ReshapeToMatrix(*y, y_num_col_dims) + : static_cast(*y); + + Tensor dout_matrix = *dout; + dout_matrix.Resize( + {framework::flatten_to_2d(x->dims(), x_num_col_dims)[0], + framework::flatten_to_2d(y->dims(), y_num_col_dims)[1]}); + + // adding mb dim because MatMulV2 handler needs it + std::vector x_dims(3, 1); + std::vector y_dims(3, 1); + std::vector dout_dims(3, 1); + + x_dims[1] = x_matrix.dims()[0]; + x_dims[2] = x_matrix.dims()[1]; + + y_dims[1] = y_matrix.dims()[0]; + y_dims[2] = y_matrix.dims()[1]; + + dout_dims[1] = dout_matrix.dims()[0]; + dout_dims[2] = dout_matrix.dims()[1]; + + if (dx != nullptr) { + dx->set_lod(x->lod()); + this->ExecuteMatMul(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), + &dout_matrix, dout_dims, false, &y_matrix, y_dims, + true, static_cast(dx)); + } + if (dy != nullptr) { + dy->set_lod(y->lod()); + this->ExecuteMatMul(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), + &x_matrix, x_dims, true, &dout_matrix, dout_dims, + false, static_cast(dy)); + } + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(mul, MKLDNN, ::paddle::platform::CPUPlace, U8, ops::kMULMKLDNNINT8, - ops::MulMKLDNNKernel); + ops::MulMKLDNNINT8Kernel); REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(mul, MKLDNN, ::paddle::platform::CPUPlace, S8, ops::kMULMKLDNNINT8, - ops::MulMKLDNNKernel); + ops::MulMKLDNNINT8Kernel); + +REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(mul, MKLDNN, ::paddle::platform::CPUPlace, + FP32, ops::kMULMKLDNNFP32, + ops::MulMKLDNNKernel); + +REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE( + mul, MKLDNN, ::paddle::platform::CPUPlace, BF16, ops::kMULMKLDNNFP32, + ops::MulMKLDNNKernel); REGISTER_OP_KERNEL(mul, MKLDNN, ::paddle::platform::CPUPlace, - ops::MulMKLDNNKernel); + ops::MulMKLDNNINT8Kernel, + ops::MulMKLDNNKernel, + ops::MulMKLDNNKernel); + +REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(mul_grad, MKLDNN, + ::paddle::platform::CPUPlace, FP32, + ops::kMULMKLDNNFP32, + ops::MulGradMKLDNNKernel); + +REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE( + mul_grad, MKLDNN, ::paddle::platform::CPUPlace, BF16, ops::kMULMKLDNNFP32, + ops::MulGradMKLDNNKernel, + ops::MulGradMKLDNNKernel); diff --git a/paddle/fluid/operators/mul_op.cc b/paddle/fluid/operators/mul_op.cc index 
14291f8458430..691c394870ad4 100644 --- a/paddle/fluid/operators/mul_op.cc +++ b/paddle/fluid/operators/mul_op.cc @@ -113,6 +113,12 @@ class MulOp : public framework::OperatorWithKernel { if (input_data_type == framework::DataTypeTrait::DataType() || input_data_type == framework::DataTypeTrait::DataType()) { customized_type_value = kMULMKLDNNINT8; + } else if (input_data_type == + framework::DataTypeTrait< + paddle::platform::bfloat16>::DataType() || + input_data_type == + framework::DataTypeTrait::DataType()) { + customized_type_value = kMULMKLDNNFP32; } } #endif @@ -233,6 +239,36 @@ class MulGradOp : public framework::OperatorWithKernel { ctx->SetOutputDim(y_grad_name, y_dims); } } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + framework::LibraryType library = framework::LibraryType::kPlain; + framework::DataLayout layout = framework::DataLayout::kAnyLayout; + int customized_type_value = + framework::OpKernelType::kDefaultCustomizedTypeValue; + auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); +#ifdef PADDLE_WITH_MKLDNN + if (library == framework::LibraryType::kPlain && + this->CanMKLDNNBeUsed(ctx, input_data_type)) { + library = framework::LibraryType::kMKLDNN; + layout = framework::DataLayout::kMKLDNN; + + if (input_data_type == framework::DataTypeTrait::DataType() || + input_data_type == framework::DataTypeTrait::DataType()) { + customized_type_value = kMULMKLDNNINT8; + } else if (input_data_type == + framework::DataTypeTrait< + paddle::platform::bfloat16>::DataType() || + input_data_type == + framework::DataTypeTrait::DataType()) { + customized_type_value = kMULMKLDNNFP32; + } + } +#endif + + return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout, + library, customized_type_value); + } }; template diff --git a/paddle/fluid/operators/mul_op.h b/paddle/fluid/operators/mul_op.h index 3a13e0576e347..0fb32cf4be886 100644 --- a/paddle/fluid/operators/mul_op.h +++ b/paddle/fluid/operators/mul_op.h @@ -25,6 +25,7 @@ namespace operators { using Tensor = framework::Tensor; constexpr int kMULMKLDNNINT8 = 1; +constexpr int kMULMKLDNNFP32 = 2; template class MulKernel : public framework::OpKernel { diff --git a/paddle/fluid/operators/unity_build_rule.cmake b/paddle/fluid/operators/unity_build_rule.cmake index 25aef67425ef9..5ab2004617810 100644 --- a/paddle/fluid/operators/unity_build_rule.cmake +++ b/paddle/fluid/operators/unity_build_rule.cmake @@ -192,7 +192,6 @@ register_unity_group(cc pad_op.cc) register_unity_group(cc modified_huber_loss_op.cc - mkldnn/mul_mkldnn_op.cc partial_sum_op.cc pixel_shuffle_op.cc pool_op.cc diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index c16137b50dbf7..ef216e48416f9 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -772,6 +772,114 @@ class ReductionMKLDNNHandler } }; +template +class MatMulV2MKLDNNHandler + : public paddle::platform::MKLDNNHandlerNoCachingT { + public: + MatMulV2MKLDNNHandler(const dnnl::engine engine, + paddle::platform::Place cpu_place, + const std::vector& x_org_dims, bool trans_x, + const std::vector& y_org_dims, bool trans_y, + bool is_output_fused, + const std::vector& x_strides_override, + const std::vector& y_strides_override) + : paddle::platform::MKLDNNHandlerNoCachingT(engine, + cpu_place) { + // M X K * K X N + std::vector x_dims(x_org_dims); + std::vector y_dims(y_org_dims); + + const int MB_idx = x_dims.size() - 3; + const int H_idx = x_dims.size() 
- 2; + const int W_idx = x_dims.size() - 1; + + if (trans_x) std::swap(x_dims[H_idx], x_dims[W_idx]); + if (trans_y) std::swap(y_dims[H_idx], y_dims[W_idx]); + + const memory::dim M = x_dims[H_idx]; + const memory::dim K = x_dims[W_idx]; + const memory::dim N = y_dims[W_idx]; + + std::vector x_strides(x_dims.size() - 3, 1); + std::vector y_strides(x_dims.size() - 3, 1); + std::vector out_strides(x_dims.size() - 3, 1); + std::vector out_ddims(x_dims.size() - 3, 1); + + x_strides.reserve(x_dims.size()); + y_strides.reserve(x_dims.size()); + out_strides.reserve(x_dims.size()); + + if (!x_strides_override.empty()) { + x_strides = x_strides_override; + } else { + if (!trans_x) { + x_strides.insert(x_strides.end(), {M * K, K, 1}); + } else { + x_strides.insert(x_strides.end(), {M * K, 1, M}); + } + } + + if (!y_strides_override.empty()) { + y_strides = y_strides_override; + } else { + if (!trans_y) { + y_strides.insert(y_strides.end(), {N * K, N, 1}); + } else { + y_strides.insert(y_strides.end(), {N * K, 1, K}); + } + } + + out_strides.insert(out_strides.end(), {M * N, N, 1}); + out_ddims.insert(out_ddims.end(), + {std::max(x_dims[MB_idx], y_dims[MB_idx]), M, N}); + + for (int i = x_dims.size() - 4; i >= 0; --i) { + out_ddims[i] = std::max(x_dims[i], y_dims[i]); + if (x_strides_override.empty()) { + x_strides[i] = x_dims[i + 1] * x_strides[i + 1]; + } + if (y_strides_override.empty()) { + y_strides[i] = y_dims[i + 1] * y_strides[i + 1]; + } + out_strides[i] = out_ddims[i + 1] * out_strides[i + 1]; + } + + if (is_output_fused) { + out_strides = FakeTransposeStrides(out_ddims); + } + + auto x_md = memory::desc(x_dims, MKLDNNGetDataType(), x_strides); + auto y_md = memory::desc(y_dims, MKLDNNGetDataType(), y_strides); + auto out_md = memory::desc(out_ddims, MKLDNNGetDataType(), out_strides); + + this->AcquireForwardPrimitiveDescriptor(x_md, y_md, out_md); + } + + std::vector FakeTransposeStrides( + const std::vector& matmul_out_dims) const { + // fuse matmul_v2 + transpose + reshape guarantees that output is 4D and + // transpose axis are: {0, 2, 1, 3} + std::vector transpose_axis = {0, 2, 1, 3}; + std::vector fake_strides(transpose_axis.size()); + int ndims = static_cast(transpose_axis.size()); + + int total_stride = 1; + + for (int i = ndims - 1; i >= 0; --i) { + fake_strides[transpose_axis[i]] = total_stride; + total_stride *= matmul_out_dims[transpose_axis[i]]; + } + + return fake_strides; + } + + std::shared_ptr AcquireWeightsMemory(const Tensor* input) { + const T* input_data = input->data(); + return this->AcquireMemoryFromPrimitive(this->fwd_pd_->weights_desc(), + to_void_cast(input_data)); + } +}; + template class ActivationMKLDNNHandler : public MKLDNNHandlerNoCachingT> 16)) + out = numpy.reshape(out, in_list.shape).view(numpy.uint16) + return out + + def train(use_cuda, save_dirname, is_local, use_bf16, pure_bf16): x = fluid.layers.data(name='x', shape=[13], dtype='float32') y = fluid.layers.data(name='y', shape=[1], dtype='float32') @@ -158,6 +167,10 @@ def infer(use_cuda, save_dirname=None, use_bf16=False): test_data = next(test_reader()) test_feat = numpy.array( [data[0] for data in test_data]).astype("float32") + + if use_bf16: + test_feat = convert_float_to_uint16(test_feat) + test_label = numpy.array( [data[1] for data in test_data]).astype("float32") diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_mul_int8_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_mul_int8_mkldnn_op.py index 0c91868d30245..9265d5f7edfbb 100644 --- 
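
[Reviewer note — illustrative sketch, not part of the patch] The convert_float_to_uint16 helper used by the bf16 tests in this series keeps only the upper 16 bits of the IEEE-754 float32 pattern (sign, exponent, top 7 mantissa bits), which is exactly the bfloat16 bit layout stored in a uint16 array. A vectorised equivalent (my own wording, not the test helper itself):

import numpy as np

def float32_to_bf16_bits(arr):
    # reinterpret the float32 payload as uint32 and keep the high half-word
    bits = np.asarray(arr, dtype=np.float32).view(np.uint32)
    return (bits >> 16).astype(np.uint16)

x = np.array([1.0, -2.5, 3.14159], dtype=np.float32)
print(float32_to_bf16_bits(x))   # truncation (no rounding), e.g. [16256 49184 16457]
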
a/python/paddle/fluid/tests/unittests/mkldnn/test_mul_int8_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_mul_int8_mkldnn_op.py @@ -16,6 +16,7 @@ import unittest import numpy as np +import paddle import paddle.fluid.core as core from paddle.fluid.tests.unittests.op_test import OpTest, skip_check_grad_ci ''' @@ -159,4 +160,5 @@ def init_data_type(self): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_mul_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_mul_mkldnn_op.py new file mode 100644 index 0000000000000..a0581d791209d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_mul_mkldnn_op.py @@ -0,0 +1,159 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +from numpy.matrixlib import defmatrix +import paddle +import paddle.fluid.core as core +from paddle.fluid.tests.unittests.op_test import OpTest, convert_float_to_uint16, OpTestTool + + +@OpTestTool.skip_if_not_cpu_bf16() +class TestMulOneDNNOp(OpTest): + def setUp(self): + self.op_type = "mul" + self.attrs = {'use_mkldnn': True} + self.init_shapes_and_attrs() + + self.x_fp32 = np.random.random(self.x_shape).astype(np.float32) + self.y_fp32 = np.random.random(self.y_shape).astype(np.float32) + + self.x = self.x_fp32 + self.y = self.y_fp32 + + self.init_inputs_dtype() + + self.inputs = {'X': self.x, 'Y': self.y} + + output = np.dot( + np.reshape(self.x_fp32, self.np_x_shape), + np.reshape(self.y_fp32, self.np_y_shape)) + self.outputs = {'Out': np.reshape(output, self.out_shape)} + + def init_shapes_and_attrs(self): + self.x_shape = (20, 5) + self.y_shape = (5, 21) + + self.np_x_shape = (20, 5) + self.np_y_shape = (5, 21) + + self.out_shape = (20, 21) + + def init_inputs_dtype(self): + pass + + def test_check_output(self): + self.check_output_with_place(core.CPUPlace()) + + def test_check_grad(self): + self.check_grad_with_place(core.CPUPlace(), ['X', 'Y'], 'Out') + + def test_check_grad_ingore_x(self): + self.check_grad_with_place(core.CPUPlace(), ['Y'], 'Out', set('X')) + + def test_check_grad_ingore_y(self): + self.check_grad_with_place(core.CPUPlace(), ['X'], 'Out', set('Y')) + + +class TestMulXNumColDims2OneDNNOp(TestMulOneDNNOp): + def init_shapes_and_attrs(self): + self.x_shape = (6, 7, 5) + self.y_shape = (5, 21) + + self.np_x_shape = (42, 5) + self.np_y_shape = (5, 21) + + self.out_shape = (6, 7, 21) + + self.attrs["x_num_col_dims"] = 2 + + +class TestMulYNumColDims2OneDNNOp(TestMulOneDNNOp): + def init_shapes_and_attrs(self): + self.x_shape = (20, 6) + self.y_shape = (2, 3, 21) + + self.np_x_shape = (20, 6) + self.np_y_shape = (6, 21) + + self.out_shape = (20, 21) + + self.attrs["y_num_col_dims"] = 2 + + +class TestMulYAndXNumColDims2OneDNNOp(TestMulOneDNNOp): + def init_shapes_and_attrs(self): + self.x_shape = (10, 5, 6) + self.y_shape = (2, 3, 21) + + 
self.np_x_shape = (50, 6) + self.np_y_shape = (6, 21) + + self.out_shape = (10, 5, 21) + + self.attrs["x_num_col_dims"] = 2 + self.attrs["y_num_col_dims"] = 2 + + +class TestMulBF16OneDNNOp(TestMulOneDNNOp): + def init_inputs_dtype(self): + self.x = convert_float_to_uint16(self.x) + self.y = convert_float_to_uint16(self.y) + + def calculate_grads(self): + x_np = np.reshape(self.x_fp32, self.np_x_shape) + y_np = np.reshape(self.y_fp32, self.np_y_shape) + + self.dout = self.outputs['Out'] + self.dout_np = np.reshape(self.dout, (x_np.shape[0], y_np.shape[1])) + + y_np_trans = np.transpose(y_np, (1, 0)) + x_np_trans = np.transpose(x_np, (1, 0)) + + self.dx = np.matmul(self.dout_np, y_np_trans) + self.dy = np.matmul(x_np_trans, self.dout_np) + + def test_check_grad(self): + self.calculate_grads() + self.check_grad_with_place( + core.CPUPlace(), ['X', 'Y'], + 'Out', + user_defined_grads=[self.dx, self.dy], + user_defined_grad_outputs=[convert_float_to_uint16(self.dout)]) + + def test_check_grad_ingore_x(self): + self.calculate_grads() + self.check_grad_with_place( + core.CPUPlace(), ['Y'], + 'Out', + set('X'), + user_defined_grads=[self.dy], + user_defined_grad_outputs=[convert_float_to_uint16(self.dout)]) + + def test_check_grad_ingore_y(self): + self.calculate_grads() + self.check_grad_with_place( + core.CPUPlace(), ['X'], + 'Out', + set('Y'), + user_defined_grads=[self.dx], + user_defined_grad_outputs=[convert_float_to_uint16(self.dout)]) + + +if __name__ == "__main__": + paddle.enable_static() + unittest.main() From 08dcea18edaf19ef1eeea1a8905e28d6f318d211 Mon Sep 17 00:00:00 2001 From: wenbin Date: Thu, 13 Jan 2022 14:00:27 +0800 Subject: [PATCH 06/24] roi_align aligned supported (#38905) roi_align aligned supported --- .../tensorrt/convert/roi_align_op.cc | 4 +- paddle/fluid/inference/tensorrt/op_teller.cc | 30 --------- .../tensorrt/plugin/roi_align_op_plugin.cu | 64 +++++++++++-------- .../tensorrt/plugin/roi_align_op_plugin.h | 4 +- .../inference/test_trt_convert_roi_align.py | 10 --- 5 files changed, 45 insertions(+), 67 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/convert/roi_align_op.cc b/paddle/fluid/inference/tensorrt/convert/roi_align_op.cc index 654fe7e013379..54f7937d83747 100644 --- a/paddle/fluid/inference/tensorrt/convert/roi_align_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/roi_align_op.cc @@ -51,6 +51,7 @@ class RoiAlignOpConverter : public OpConverter { BOOST_GET_CONST(float, op_desc.GetAttr("spatial_scale")); const auto sampling_ratio = BOOST_GET_CONST(int, op_desc.GetAttr("sampling_ratio")); + const auto aligned = BOOST_GET_CONST(bool, op_desc.GetAttr("aligned")); const auto input_tensor = engine_->GetITensor(input_name); const auto rois_tensor = engine_->GetITensor(rois_name); @@ -63,7 +64,8 @@ class RoiAlignOpConverter : public OpConverter { nvinfer1::ILayer* layer = nullptr; auto* roi_align_plugin = new plugin::RoiAlignPluginDynamic( - data_type_, pooled_height, pooled_width, spatial_scale, sampling_ratio); + data_type_, pooled_height, pooled_width, spatial_scale, sampling_ratio, + aligned); auto roi_align_layer = engine_->network()->addPluginV2( inputs.data(), inputs.size(), *roi_align_plugin); layer = roi_align_layer; diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 878eef016e7d1..ddee4e0d682b0 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -13,9 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/inference/tensorrt/op_teller.h" - #include - #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/data_layout.h" @@ -737,28 +735,6 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, } } - if (op_type == "roi_align") { - if (!with_dynamic_shape) return false; - - std::vector attrs{"pooled_height", "pooled_width", - "spatial_scale", "sampling_ratio"}; - for (auto const attr : attrs) { - if (!desc.HasAttr(attr)) return false; - } - - const auto pooled_height = - BOOST_GET_CONST(int, desc.GetAttr("pooled_height")); - if (pooled_height <= 0) return false; - - const auto pooled_width = - BOOST_GET_CONST(int, desc.GetAttr("pooled_width")); - if (pooled_width <= 0) return false; - - const auto spatial_scale = - BOOST_GET_CONST(float, desc.GetAttr("spatial_scale")); - if (spatial_scale <= 0.f) return false; - } - if (op_type == "hard_swish") { if (desc.Input("X").size() != 1) { VLOG(3) << "HardSwish op has only 1 input, but got " @@ -1303,12 +1279,6 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, BOOST_GET_CONST(float, desc.GetAttr("spatial_scale")); if (spatial_scale <= 0.f) return false; - const auto sampling_ratio = - BOOST_GET_CONST(int, desc.GetAttr("sampling_ratio")); - const auto aligned = BOOST_GET_CONST(bool, desc.GetAttr("aligned")); - - if (sampling_ratio == -1 && aligned == true) return false; - auto roi_align_inputs = desc.Inputs(); if (roi_align_inputs.find("RoisNum") != roi_align_inputs.end()) { if (desc.Input("RoisNum").size() >= 1) { diff --git a/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.cu index 06540b3626082..7dc31fb44719a 100644 --- a/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.cu @@ -58,14 +58,12 @@ __inline__ __device__ T BilinearInterpolate(const T* input_data, } template -__global__ void GPUROIAlignOpt(const int nthreads, - const T* __restrict__ input_data, - const T* __restrict__ input_rois, - const float spatial_scale, const int channels, - const int height, const int width, - const int pooled_height, const int pooled_width, - const int sampling_ratio, const int num_rois, - OutT* __restrict__ output_data) { +__global__ void GPUROIAlignOpt( + const int nthreads, const T* __restrict__ input_data, + const T* __restrict__ input_rois, const float spatial_scale, + const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, const int sampling_ratio, + const int num_rois, const bool aligned, OutT* __restrict__ output_data) { const int batch = blockIdx.x; const int channel = blockIdx.y; const T* offset_input_data = @@ -84,21 +82,28 @@ __global__ void GPUROIAlignOpt(const int nthreads, const int roi_idx = (idx / pooled_width / pooled_height) % num_rois; const int n = batch * num_rois + roi_idx; const float4 rois_offset = reinterpret_cast(input_rois)[n]; - const T roi_xmin = rois_offset.x * spatial_scale; - const T roi_ymin = rois_offset.y * spatial_scale; - const T roi_xmax = rois_offset.z * spatial_scale; - const T roi_ymax = rois_offset.w * spatial_scale; - const T roi_width = max(roi_xmax - roi_xmin, static_cast(1.f)); - const T roi_height = max(roi_ymax - roi_ymin, static_cast(1.f)); - const T bin_size_h = roi_height / static_cast(pooled_height); - const T bin_size_w = roi_width / static_cast(pooled_width); + const T roi_offset = aligned ? 
static_cast(0.5) : 0; + const T roi_xmin = rois_offset.x * spatial_scale - roi_offset; + const T roi_ymin = rois_offset.y * spatial_scale - roi_offset; + const T roi_xmax = rois_offset.z * spatial_scale - roi_offset; + const T roi_ymax = rois_offset.w * spatial_scale - roi_offset; + + T roi_width = roi_xmax - roi_xmin; + T roi_height = roi_ymax - roi_ymin; + if (!aligned) { + roi_width = max(roi_width, static_cast(1.)); + roi_height = max(roi_height, static_cast(1.)); + } + const T bin_size_h = + static_cast(roi_height) / static_cast(pooled_height); + const T bin_size_w = + static_cast(roi_width) / static_cast(pooled_width); const int roi_bin_grid_h = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_height / pooled_height); const int roi_bin_grid_w = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); - const T count = roi_bin_grid_h * roi_bin_grid_w; - + const T count = max(roi_bin_grid_h * roi_bin_grid_w, 1); T output_val = 0.f; for (int iy = 0; iy < roi_bin_grid_h; ++iy) { const T y = roi_ymin + ph * bin_size_h + @@ -132,12 +137,13 @@ RoiAlignPluginDynamic::RoiAlignPluginDynamic(const nvinfer1::DataType data_type, const int pooled_height, const int pooled_width, float spatial_scale, - int sampling_ratio) + int sampling_ratio, bool aligned) : data_type_(data_type), pooled_height_(pooled_height), pooled_width_(pooled_width), spatial_scale_(spatial_scale), - sampling_ratio_(sampling_ratio) { + sampling_ratio_(sampling_ratio), + aligned_(aligned) { bool data_type_is_valid = data_type_ == nvinfer1::DataType::kFLOAT || data_type_ == nvinfer1::DataType::kHALF; PADDLE_ENFORCE_EQ(data_type_is_valid, true, @@ -187,6 +193,7 @@ RoiAlignPluginDynamic::RoiAlignPluginDynamic(void const* data, size_t length) { DeserializeValue(&data, &length, &pooled_width_); DeserializeValue(&data, &length, &spatial_scale_); DeserializeValue(&data, &length, &sampling_ratio_); + DeserializeValue(&data, &length, &aligned_); int smem_per_block = -1; int device = -1; cudaGetDevice(&device); @@ -204,7 +211,7 @@ nvinfer1::IPluginV2DynamicExt* RoiAlignPluginDynamic::clone() const TRT_NOEXCEPT { auto* plugin = new RoiAlignPluginDynamic(data_type_, pooled_height_, pooled_width_, - spatial_scale_, sampling_ratio_); + spatial_scale_, sampling_ratio_, aligned_); plugin->setPluginNamespace(namespace_.c_str()); return plugin; } @@ -272,14 +279,15 @@ int RoiAlignPluginDynamic::enqueue_impl( output_size, static_cast(inputs[0]), static_cast(inputs[1]), spatial_scale_, channels, height, width, pooled_height_, pooled_width_, sampling_ratio_, rois_num / batch, - static_cast(outputs[0])); + aligned_, static_cast(outputs[0])); } else { GPUROIAlignOpt< - T, OutT, true><<>>( + T, OutT, + false><<>>( output_size, static_cast(inputs[0]), static_cast(inputs[1]), spatial_scale_, channels, height, width, pooled_height_, pooled_width_, sampling_ratio_, rois_num / batch, - static_cast(outputs[0])); + aligned_, static_cast(outputs[0])); } return cudaGetLastError() != cudaSuccess; @@ -313,6 +321,10 @@ const char* RoiAlignPluginDynamic::getPluginType() const TRT_NOEXCEPT { return "roi_align_plugin_dynamic"; } +const char* RoiAlignPluginDynamic::getPluginVersion() const TRT_NOEXCEPT { + return "2"; +} + int RoiAlignPluginDynamic::getNbOutputs() const TRT_NOEXCEPT { return 1; } int RoiAlignPluginDynamic::initialize() TRT_NOEXCEPT { return 0; } @@ -326,6 +338,7 @@ size_t RoiAlignPluginDynamic::getSerializationSize() const TRT_NOEXCEPT { serialize_size += SerializedSize(pooled_width_); serialize_size += SerializedSize(spatial_scale_); 
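
[Reviewer note — illustrative sketch, not part of the patch] The updated GPUROIAlignOpt kernel above shifts the ROI coordinates by half a pixel when aligned=True and only clamps the box to a minimum 1x1 size in the legacy (aligned=False) path. A small sketch of that geometry with made-up numbers:

def roi_geometry(xmin, ymin, xmax, ymax, spatial_scale, aligned):
    offset = 0.5 if aligned else 0.0
    x0 = xmin * spatial_scale - offset
    y0 = ymin * spatial_scale - offset
    x1 = xmax * spatial_scale - offset
    y1 = ymax * spatial_scale - offset
    roi_w, roi_h = x1 - x0, y1 - y0
    if not aligned:                    # legacy behaviour keeps a minimum 1x1 box
        roi_w, roi_h = max(roi_w, 1.0), max(roi_h, 1.0)
    return x0, y0, roi_w, roi_h

print(roi_geometry(4, 4, 8, 8, spatial_scale=0.25, aligned=False))  # (1.0, 1.0, 1.0, 1.0)
print(roi_geometry(4, 4, 8, 8, spatial_scale=0.25, aligned=True))   # (0.5, 0.5, 1.0, 1.0)
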
serialize_size += SerializedSize(sampling_ratio_); + serialize_size += SerializedSize(aligned_); return serialize_size; } @@ -335,6 +348,7 @@ void RoiAlignPluginDynamic::serialize(void* buffer) const TRT_NOEXCEPT { SerializeValue(&buffer, pooled_width_); SerializeValue(&buffer, spatial_scale_); SerializeValue(&buffer, sampling_ratio_); + SerializeValue(&buffer, aligned_); } void RoiAlignPluginDynamic::destroy() TRT_NOEXCEPT {} @@ -357,7 +371,7 @@ const char* RoiAlignPluginDynamicCreator::getPluginName() const TRT_NOEXCEPT { const char* RoiAlignPluginDynamicCreator::getPluginVersion() const TRT_NOEXCEPT { - return "1"; + return "2"; } const nvinfer1::PluginFieldCollection* diff --git a/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.h index 44d2b63069835..9f4723da9e17b 100644 --- a/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.h @@ -31,7 +31,7 @@ class RoiAlignPluginDynamic : public DynamicPluginTensorRT { explicit RoiAlignPluginDynamic(const nvinfer1::DataType data_type, const int pooled_height, const int pooled_width, float spatial_scale, - int sampling_ratio); + int sampling_ratio, bool aligned); RoiAlignPluginDynamic(void const* data, size_t length); ~RoiAlignPluginDynamic() = default; nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override; @@ -66,6 +66,7 @@ class RoiAlignPluginDynamic : public DynamicPluginTensorRT { size_t getSerializationSize() const TRT_NOEXCEPT override; void serialize(void* buffer) const TRT_NOEXCEPT override; void destroy() TRT_NOEXCEPT override; + const char* getPluginVersion() const TRT_NOEXCEPT override; private: template @@ -80,6 +81,7 @@ class RoiAlignPluginDynamic : public DynamicPluginTensorRT { float spatial_scale_; int sampling_ratio_; int smem_per_block_; + bool aligned_; std::string namespace_; }; diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_roi_align.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_roi_align.py index 56efdb91959ce..b2d754337fe02 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_roi_align.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_roi_align.py @@ -176,16 +176,6 @@ def teller1(program_config, predictor_config): self.add_skip_case(teller1, SkipReasons.TRT_NOT_SUPPORT, "INPUT RoisNum NOT SUPPORT") - def teller2(program_config, predictor_config): - if (program_config.ops[0].attrs['sampling_ratio'] == -1 and - program_config.ops[0].attrs['aligned'] == True): - return True - return False - - self.add_skip_case( - teller2, SkipReasons.TRT_NOT_SUPPORT, - "SAMPLING_RATIO EQUAL TO - 1 WHEN ALIGNED IS TRUE IS NOT SUPPORT") - def test(self): self.add_skip_trt_case() self.run_test() From a6cf6cddd323436b0e441aeb6f67a9a5da6c2172 Mon Sep 17 00:00:00 2001 From: LiYuRio <63526175+LiYuRio@users.noreply.github.com> Date: Thu, 13 Jan 2022 14:32:22 +0800 Subject: [PATCH 07/24] [fleet_executor] fix uninitialized pointer (#38904) --- paddle/fluid/distributed/fleet_executor/carrier.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/distributed/fleet_executor/carrier.h b/paddle/fluid/distributed/fleet_executor/carrier.h index 7762effdb9c87..9a74fa78c0e76 100644 --- a/paddle/fluid/distributed/fleet_executor/carrier.h +++ b/paddle/fluid/distributed/fleet_executor/carrier.h @@ -101,8 +101,8 @@ class Carrier final { std::mutex running_mutex_; 
std::condition_variable cond_var_; std::vector microbatch_scopes_; - framework::Scope* root_scope_; - framework::Scope* minibatch_scope_; + framework::Scope* root_scope_{nullptr}; + framework::Scope* minibatch_scope_{nullptr}; paddle::platform::Place place_; paddle::platform::DeviceContext* dev_ctx_{nullptr}; int64_t rank_; From 53783e1e3d972a5eccb4936ce0ef9ee4aa292a96 Mon Sep 17 00:00:00 2001 From: JZ-LIANG Date: Thu, 13 Jan 2022 14:48:24 +0800 Subject: [PATCH 08/24] [Dist Pass] AMP pass add dist_update_loss_scaling op (#38902) --- .../auto_parallel/operators/__init__.py | 1 + .../auto_parallel/operators/common.py | 2 +- .../operators/dist_update_loss_scaling.py | 134 ++++++++++++++++++ 3 files changed, 136 insertions(+), 1 deletion(-) create mode 100644 python/paddle/distributed/auto_parallel/operators/dist_update_loss_scaling.py diff --git a/python/paddle/distributed/auto_parallel/operators/__init__.py b/python/paddle/distributed/auto_parallel/operators/__init__.py index 5502cb3191a48..c28b7930124dd 100644 --- a/python/paddle/distributed/auto_parallel/operators/__init__.py +++ b/python/paddle/distributed/auto_parallel/operators/__init__.py @@ -24,3 +24,4 @@ from . import dist_transpose from . import dist_default from . import dist_check_finite_and_unscale +from . import dist_update_loss_scaling diff --git a/python/paddle/distributed/auto_parallel/operators/common.py b/python/paddle/distributed/auto_parallel/operators/common.py index 32496b94b920c..8f1ba33f544fb 100644 --- a/python/paddle/distributed/auto_parallel/operators/common.py +++ b/python/paddle/distributed/auto_parallel/operators/common.py @@ -15,7 +15,7 @@ from ..dist_attribute import OperatorDistributedAttribute _g_distributed_operator_impl_registries = {} -BACKWARD_ONLY_DIST_OPS = {'check_finite_and_unscale'} +BACKWARD_ONLY_DIST_OPS = {'check_finite_and_unscale', 'update_loss_scaling'} class DistributedOperatorImplContainer: diff --git a/python/paddle/distributed/auto_parallel/operators/dist_update_loss_scaling.py b/python/paddle/distributed/auto_parallel/operators/dist_update_loss_scaling.py new file mode 100644 index 0000000000000..56782bec0856a --- /dev/null +++ b/python/paddle/distributed/auto_parallel/operators/dist_update_loss_scaling.py @@ -0,0 +1,134 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License + +from .common import DistributedOperatorImplContainer +from .common import DistributedOperatorImpl +from .common import register_distributed_operator_impl_container +from .common import register_distributed_operator_impl +from ..utils import set_dist_op_desc_original_id + + +class DistributedUpdateLossScaling(DistributedOperatorImplContainer): + def __init__(self, name): + super(DistributedUpdateLossScaling, self).__init__() + self._name = name + + +register_distributed_operator_impl_container( + "update_loss_scaling", DistributedUpdateLossScaling("update_loss_scaling")) + + +class DistributedUpdateLossScalingImpl(DistributedOperatorImpl): + def __init__(self, name): + super(DistributedUpdateLossScalingImpl, self).__init__() + self._name = name + self._forward_implemented = False + self._backward_implemented = True + + def is_input_compatible(self, dist_op): + raise RuntimeError( + "DistributedUpdateLossScalingImpl's is_input_compatible should not be called !" + ) + + def is_output_compatible(self, dist_op): + raise RuntimeError( + "DistributedUpdateLossScalingImpl's is_output_compatible should not be called !" + ) + + def update_dims_mapping(self, dist_op): + raise RuntimeError( + "DistributedUpdateLossScalingImpl's update_dims_mapping should not be called !" + ) + + @staticmethod + def forward(ctx, *args, **kwargs): + raise RuntimeError( + "DistributedUpdateLossScalingImpl's forward should not be called !") + + @staticmethod + def backward(ctx, *args, **kwargs): + + # the backward function only filte the gradient with current rank id + dist_op_context = ctx.dist_op_context + main_block = dist_op_context.get_dst_main_program().global_block() + backward_op = dist_op_context.get_cur_src_op() + rank_id = dist_op_context.get_rank_id() + dist_attr = ctx.get_op_dist_attr_for_program(backward_op) + assert dist_attr is not None, "backward op [{}] don't have dist attribute !".format( + str(backward_op)) + + assert rank_id in dist_attr.process_mesh.processes + + assert 'X' in kwargs, "input [{}] is not given".format('X') + assert 'FoundInfinite' in kwargs, "input [{}] is not given".format( + 'FoundInfinite') + assert 'PrevLossScaling' in kwargs, "input [{}] is not given".format( + 'PrevLossScaling') + assert 'InGoodSteps' in kwargs, "input [{}] is not given".format( + 'InGoodSteps') + assert 'InBadSteps' in kwargs, "input [{}] is not given".format( + 'InBadSteps') + + assert 'Out' in kwargs, "output [{}] is not given".format('Out') + assert 'LossScaling' in kwargs, "output [{}] is not given".format( + 'LossScaling') + assert 'OutGoodSteps' in kwargs, "input [{}] is not given".format( + 'OutGoodSteps') + assert 'OutBadSteps' in kwargs, "input [{}] is not given".format( + 'OutBadSteps') + + assert len(kwargs['FoundInfinite']) == 1, \ + "update_loss_scaling input FoundInfinite take 1 variable but got {}".format( + kwargs['FoundInfinite']) + assert len(kwargs['PrevLossScaling']) == 1, \ + "update_loss_scaling input PrevLossScaling take 1 variable but got {}".format( + kwargs['PrevLossScaling']) + assert len(kwargs['InGoodSteps']) == 1, \ + "update_loss_scaling input InGoodSteps take 1 variable but got {}".format( + kwargs['InGoodSteps']) + assert len(kwargs['InBadSteps']) == 1, \ + "update_loss_scaling input InBadSteps take 1 variable but got {}".format( + kwargs['InBadSteps']) + assert len(kwargs['LossScaling']) == 1, \ + "update_loss_scaling output LossScaling take 1 variable but got {}".format( 
+ kwargs['LossScaling']) + assert len(kwargs['OutGoodSteps']) == 1, \ + "update_loss_scaling output OutGoodSteps take 1 variable but got {}".format( + kwargs['OutGoodSteps']) + assert len(kwargs['OutBadSteps']) == 1, \ + "update_loss_scaling output OutBadSteps take 1 variable but got {}".format( + kwargs['OutBadSteps']) + + assert len(kwargs['X']) == len(kwargs['Out']), \ + "update_loss_scaling got [{}] X and [{}] Out, which are supposed to be equal".format( + len(kwargs['X']), len(kwargs['Out'])) + + filter_vars = [] + for varname in kwargs['X']: + if rank_id in ctx.get_tensor_dist_attr_for_program( + main_block.var(varname)).process_mesh.processes: + filter_vars.append(varname) + + # replicate op in dist program + dist_op_desc = main_block.desc.append_op() + dist_op_desc.copy_from(backward_op.desc) + set_dist_op_desc_original_id(dist_op_desc, backward_op.desc, ctx) + dist_op_desc.set_input('X', filter_vars) + dist_op_desc.set_output('Out', filter_vars) + main_block._sync_with_cpp() + + +register_distributed_operator_impl( + "update_loss_scaling", + DistributedUpdateLossScalingImpl("update_loss_scaling")) From 7e0292ead7d8c0632135e5480870e4c6bdf93acd Mon Sep 17 00:00:00 2001 From: chentianyu03 Date: Thu, 13 Jan 2022 14:51:17 +0800 Subject: [PATCH 09/24] [pten]Remove pten/include dir files (#38878) * move dot_dev api into dot_kernel.h * add infermate header * modify to dotkerel in dot_op.h * mvoe conj dev api into complex_kernel.h * move sign dev api into sign_kernel.h * move scale dev api into kernel.h and remove infermete.h * rm paddle/pten/include/math.h * rm paddle/pten/include/math.h * rm include dir * rm paddle/pten/include/math.h * fix conflict with develop branch * rm devContext in conj_op.h * add the missing complex_kernel header --- .../eager/accumulation/accumulation_node.cc | 1 - .../accumulation/gradient_accumulation.cc | 1 - .../eager_generated/backwards/scale_node.cc | 16 +++--- .../eager_generated/forwards/scale.cc | 1 - paddle/fluid/eager/eager_tensor.h | 1 - paddle/fluid/eager/grad_node_info.h | 1 - .../eager/legacy/infer_var_type_context.h | 1 - paddle/fluid/eager/legacy/prepared_operator.h | 2 - paddle/fluid/eager/legacy/tensor_helper.h | 1 - .../framework/data_device_transform_test.cu | 1 - paddle/fluid/framework/operator.h | 3 +- paddle/fluid/imperative/layer.h | 1 - paddle/fluid/imperative/op_base.h | 1 - paddle/fluid/imperative/prepared_operator.h | 2 - paddle/fluid/operators/cast_op.h | 1 - paddle/fluid/operators/conj_op.h | 3 +- paddle/fluid/operators/dot_op.h | 1 - .../elementwise/elementwise_add_op.h | 1 - .../elementwise/elementwise_mul_op.h | 1 - .../elementwise/elementwise_op_function.h | 1 - .../elementwise/elementwise_op_impl.cu.h | 1 - .../elementwise/elementwise_sub_op.h | 1 - paddle/fluid/operators/fill_any_like_op.h | 1 - paddle/fluid/operators/flatten_op.h | 1 - paddle/fluid/operators/matmul_v2_op.h | 1 - paddle/fluid/operators/reduce_ops/reduce_op.h | 2 - paddle/fluid/operators/reshape_op.cc | 1 - paddle/fluid/operators/scale_op.h | 5 +- paddle/fluid/operators/sign_op.h | 1 - paddle/fluid/pybind/eager.cc | 1 - paddle/fluid/pybind/eager_functions.cc | 1 - paddle/fluid/pybind/eager_method.cc | 1 - paddle/fluid/pybind/eager_properties.cc | 1 - paddle/fluid/pybind/eager_utils.cc | 1 - paddle/pten/CMakeLists.txt | 2 +- paddle/pten/all.cc | 17 ------- paddle/pten/all.h | 20 -------- paddle/pten/api/lib/utils.cc | 3 +- paddle/pten/include/core.h | 22 -------- paddle/pten/include/infermeta.h | 21 -------- paddle/pten/include/math.h | 39 --------------- 
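
[Reviewer note — illustrative sketch, not part of the patch] For context on the op that the DistributedUpdateLossScalingImpl above merely replicates per rank: a rough, simplified sketch of a dynamic loss-scaling update. Parameter names and defaults here are assumptions for illustration, not Paddle's exact implementation.

def update_loss_scaling(found_inf, scale, good_steps, bad_steps,
                        incr_every_n_steps=1000, decr_every_n_nan_or_inf=2,
                        incr_ratio=2.0, decr_ratio=0.5):
    if found_inf:
        good_steps, bad_steps = 0, bad_steps + 1
        if bad_steps == decr_every_n_nan_or_inf:   # repeated overflow: shrink the scale
            scale, bad_steps = scale * decr_ratio, 0
    else:
        good_steps, bad_steps = good_steps + 1, 0
        if good_steps == incr_every_n_steps:       # long stable run: grow the scale
            scale, good_steps = scale * incr_ratio, 0
    return scale, good_steps, bad_steps

print(update_loss_scaling(True, 32768.0, 5, 1))    # -> (16384.0, 0, 0)
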
paddle/pten/kernels/complex_kernel.h | 3 -- paddle/pten/kernels/cpu/scale_kernel.cc | 34 ++++++++++++- paddle/pten/kernels/flatten_kernel.h | 2 +- paddle/pten/kernels/gpu/scale_kernel.cu | 14 +++--- .../kernels/impl/matmul_grad_kernel_impl.h | 3 +- paddle/pten/kernels/impl/scale_kernel_impl.h | 50 ------------------- paddle/pten/kernels/math_kernel.h | 3 +- paddle/pten/kernels/reshape_kernel.h | 2 +- paddle/pten/kernels/scale_kernel.h | 28 ++++++++--- paddle/pten/kernels/sign_kernel.h | 2 +- paddle/pten/tests/api/scale_api.h | 35 +++++++------ .../pten/tests/kernels/test_scale_dev_api.cc | 2 +- python/paddle/utils/code_gen/api_gen.py | 6 ++- 54 files changed, 103 insertions(+), 265 deletions(-) delete mode 100644 paddle/pten/all.cc delete mode 100644 paddle/pten/all.h delete mode 100644 paddle/pten/include/core.h delete mode 100644 paddle/pten/include/infermeta.h delete mode 100644 paddle/pten/include/math.h delete mode 100644 paddle/pten/kernels/impl/scale_kernel_impl.h diff --git a/paddle/fluid/eager/accumulation/accumulation_node.cc b/paddle/fluid/eager/accumulation/accumulation_node.cc index ed1146eed0fb0..823c0153d71b0 100644 --- a/paddle/fluid/eager/accumulation/accumulation_node.cc +++ b/paddle/fluid/eager/accumulation/accumulation_node.cc @@ -18,7 +18,6 @@ #include "paddle/pten/api/all.h" #include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/include/core.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/eager/accumulation/gradient_accumulation.cc b/paddle/fluid/eager/accumulation/gradient_accumulation.cc index 9bc24dd28756a..1f66596a0b578 100644 --- a/paddle/fluid/eager/accumulation/gradient_accumulation.cc +++ b/paddle/fluid/eager/accumulation/gradient_accumulation.cc @@ -28,7 +28,6 @@ #include "paddle/fluid/platform/float16.h" #include "paddle/pten/api/all.h" #include "paddle/pten/core/convert_utils.h" -#include "paddle/pten/include/core.h" #include "unsupported/Eigen/CXX11/Tensor" #ifdef PADDLE_WITH_XPU #include "xpu/refactor/math.h" diff --git a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc index 02eaa79fc9b28..99f6c7a83538e 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc +++ b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc @@ -16,7 +16,7 @@ #include "paddle/fluid/eager/api/utils/global_utils.h" #include "paddle/fluid/eager/eager_tensor.h" -#include "paddle/pten/api/all.h" +#include "paddle/pten/kernels/scale_kernel.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" @@ -33,28 +33,28 @@ static void ScaleDeviceDispatch(const pten::DenseTensor& dense_tensor, pten::DenseTensor* dense_out) { switch (dense_tensor.dtype()) { case pten::DataType::FLOAT64: { - pten::Scale( + pten::ScaleKernel( dev_ctx, dense_tensor /* tensor */, scale /* scale */, bias /* bias */, bias_after_scale /* bias_after_scale */, dense_out /* out tensor */); break; } case pten::DataType::FLOAT32: { - pten::Scale(dev_ctx, dense_tensor /* tensor */, - scale /* scale */, bias /* bias */, - bias_after_scale /* bias_after_scale */, - dense_out /* out tensor */); + pten::ScaleKernel( + dev_ctx, dense_tensor /* tensor */, scale /* scale */, + bias /* bias */, bias_after_scale /* bias_after_scale */, + dense_out /* out tensor */); break; } case pten::DataType::INT64: { - pten::Scale( + pten::ScaleKernel( dev_ctx, dense_tensor /* 
tensor */, scale /* scale */, bias /* bias */, bias_after_scale /* bias_after_scale */, dense_out /* out tensor */); break; } case pten::DataType::INT32: { - pten::Scale( + pten::ScaleKernel( dev_ctx, dense_tensor /* tensor */, scale /* scale */, bias /* bias */, bias_after_scale /* bias_after_scale */, dense_out /* out tensor */); diff --git a/paddle/fluid/eager/api/generated/eager_generated/forwards/scale.cc b/paddle/fluid/eager/api/generated/eager_generated/forwards/scale.cc index 7b20ff144a7a7..642302a4119be 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/forwards/scale.cc +++ b/paddle/fluid/eager/api/generated/eager_generated/forwards/scale.cc @@ -29,7 +29,6 @@ #include "paddle/fluid/eager/utils.h" #include "paddle/pten/api/all.h" -#include "paddle/pten/include/core.h" namespace egr { diff --git a/paddle/fluid/eager/eager_tensor.h b/paddle/fluid/eager/eager_tensor.h index 80faad9080ffe..c58c0b9e66e7a 100644 --- a/paddle/fluid/eager/eager_tensor.h +++ b/paddle/fluid/eager/eager_tensor.h @@ -18,7 +18,6 @@ #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/variable.h" // pten deps -#include "paddle/pten/all.h" #include "paddle/pten/api/all.h" #include "paddle/pten/api/lib/api_declare.h" #include "paddle/pten/api/lib/utils/tensor_utils.h" diff --git a/paddle/fluid/eager/grad_node_info.h b/paddle/fluid/eager/grad_node_info.h index f15c50ef75190..5cf0b90220148 100644 --- a/paddle/fluid/eager/grad_node_info.h +++ b/paddle/fluid/eager/grad_node_info.h @@ -16,7 +16,6 @@ #include "paddle/fluid/eager/eager_tensor.h" #include "paddle/pten/api/all.h" -#include "paddle/pten/include/core.h" namespace egr { /** diff --git a/paddle/fluid/eager/legacy/infer_var_type_context.h b/paddle/fluid/eager/legacy/infer_var_type_context.h index 2d5a8d806fee7..9d9cbeb38ccfa 100644 --- a/paddle/fluid/eager/legacy/infer_var_type_context.h +++ b/paddle/fluid/eager/legacy/infer_var_type_context.h @@ -26,7 +26,6 @@ #include "paddle/fluid/framework/var_type_inference.h" #include "paddle/fluid/framework/var_type_traits.h" #include "paddle/pten/api/all.h" -#include "paddle/pten/include/core.h" namespace egr { namespace legacy { diff --git a/paddle/fluid/eager/legacy/prepared_operator.h b/paddle/fluid/eager/legacy/prepared_operator.h index 9ba186b14e3b3..0e00b52e0481a 100644 --- a/paddle/fluid/eager/legacy/prepared_operator.h +++ b/paddle/fluid/eager/legacy/prepared_operator.h @@ -25,8 +25,6 @@ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/type_defs.h" -#include "paddle/pten/include/core.h" - DECLARE_bool(use_mkldnn); namespace paddle { diff --git a/paddle/fluid/eager/legacy/tensor_helper.h b/paddle/fluid/eager/legacy/tensor_helper.h index f87ab70c93686..ce407f8965aa0 100644 --- a/paddle/fluid/eager/legacy/tensor_helper.h +++ b/paddle/fluid/eager/legacy/tensor_helper.h @@ -17,7 +17,6 @@ #include #include "paddle/fluid/eager/eager_tensor.h" #include "paddle/pten/api/all.h" -#include "paddle/pten/include/core.h" namespace egr { namespace legacy { diff --git a/paddle/fluid/framework/data_device_transform_test.cu b/paddle/fluid/framework/data_device_transform_test.cu index a81e4abd45e56..858688dffd8c1 100644 --- a/paddle/fluid/framework/data_device_transform_test.cu +++ b/paddle/fluid/framework/data_device_transform_test.cu @@ -24,7 +24,6 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/init.h" #include "paddle/fluid/framework/pten_utils.h" -#include "paddle/pten/include/core.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 8e69f96dfb813..9d75c66beb7d4 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -41,7 +41,8 @@ limitations under the License. */ #include "paddle/utils/flat_hash_map.h" #include "paddle/pten/core/arg_map_context.h" -#include "paddle/pten/include/core.h" +#include "paddle/pten/core/kernel_context.h" +#include "paddle/pten/core/kernel_factory.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index 199d62bff1f20..d27460aeeccef 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -37,7 +37,6 @@ #include "paddle/fluid/imperative/variable_wrapper.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/macros.h" -#include "paddle/pten/include/core.h" namespace paddle { namespace framework { class Variable; diff --git a/paddle/fluid/imperative/op_base.h b/paddle/fluid/imperative/op_base.h index 3ff451f817872..cb76a82353282 100644 --- a/paddle/fluid/imperative/op_base.h +++ b/paddle/fluid/imperative/op_base.h @@ -25,7 +25,6 @@ #include "paddle/fluid/imperative/type_defs.h" #include "paddle/fluid/imperative/variable_wrapper.h" #include "paddle/fluid/platform/place.h" -#include "paddle/pten/include/core.h" namespace paddle { namespace imperative { diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h index 5262b265b1b53..29747e79ef6fa 100644 --- a/paddle/fluid/imperative/prepared_operator.h +++ b/paddle/fluid/imperative/prepared_operator.h @@ -27,8 +27,6 @@ #include "paddle/fluid/imperative/layer.h" #include "paddle/fluid/imperative/type_defs.h" -#include "paddle/pten/include/core.h" - DECLARE_bool(use_mkldnn); namespace paddle { diff --git a/paddle/fluid/operators/cast_op.h b/paddle/fluid/operators/cast_op.h index 72aa9a195ec7c..c54c811b25b66 100644 --- a/paddle/fluid/operators/cast_op.h +++ b/paddle/fluid/operators/cast_op.h @@ -19,7 +19,6 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/transform.h" #include "paddle/pten/api/lib/utils/tensor_utils.h" -#include "paddle/pten/include/core.h" #include "paddle/pten/kernels/cast_kernel.h" namespace paddle { diff --git a/paddle/fluid/operators/conj_op.h b/paddle/fluid/operators/conj_op.h index 71115c2eba796..6df982abb8612 100644 --- a/paddle/fluid/operators/conj_op.h +++ b/paddle/fluid/operators/conj_op.h @@ -19,7 +19,6 @@ // only can include the headers in paddle/pten/api dirs #include "paddle/pten/api/lib/utils/tensor_utils.h" -#include "paddle/pten/include/core.h" #include "paddle/pten/kernels/complex_kernel.h" namespace paddle { @@ -39,7 +38,7 @@ class ConjKernel : public framework::OpKernel { auto pt_out = paddle::experimental::MakePtenDenseTensor(*out); // call new kernel - pten::ConjKernel(dev_ctx, *pt_x.get(), pt_out.get()); + pten::ConjKernel(dev_ctx, *pt_x.get(), pt_out.get()); } }; diff --git a/paddle/fluid/operators/dot_op.h b/paddle/fluid/operators/dot_op.h index 8817e2f3ca79d..ceb8a28e8aa4c 100644 --- a/paddle/fluid/operators/dot_op.h +++ b/paddle/fluid/operators/dot_op.h @@ -21,7 +21,6 @@ // only can include the headers in paddle/pten/api dirs #include "paddle/pten/api/lib/utils/tensor_utils.h" -#include "paddle/pten/include/core.h" #include "paddle/pten/kernels/dot_grad_kernel.h" #include "paddle/pten/kernels/dot_kernel.h" diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.h b/paddle/fluid/operators/elementwise/elementwise_add_op.h index 35807d7c57d47..622a6d7edb783 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.h @@ -18,7 +18,6 @@ limitations under the License. */ #include #include "paddle/fluid/operators/elementwise/elementwise_op.h" -// only can include the headers in paddle/pten/include dirs #include "paddle/pten/kernels/math_kernel.h" namespace paddle { diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.h b/paddle/fluid/operators/elementwise/elementwise_mul_op.h index 385c7549e07f2..687340b668a13 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.h @@ -18,7 +18,6 @@ limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_op.h" #include "paddle/fluid/platform/cpu_info.h" -// only can include the headers in paddle/pten/include dirs #include "paddle/pten/kernels/math_kernel.h" namespace paddle { diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h index 37d29ed91b3d4..626046890fb06 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h @@ -29,7 +29,6 @@ limitations under the License. */ #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/transform.h" -// only can include the headers in paddle/pten/include dirs #include "paddle/pten/api/lib/utils/tensor_utils.h" #include "paddle/pten/kernels/cpu/elementwise.h" diff --git a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h index 36ff1ae254d20..9cc741344e50e 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h @@ -22,7 +22,6 @@ limitations under the License. 
*/ // only can include the headers in paddle/top/api dirs #include "paddle/pten/api/lib/utils/tensor_utils.h" -#include "paddle/pten/include/core.h" #include "paddle/pten/kernels/gpu/elementwise.h" namespace paddle { diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.h b/paddle/fluid/operators/elementwise/elementwise_sub_op.h index 09818380d8ea7..f035e46d1d082 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.h @@ -16,7 +16,6 @@ limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_op.h" -// only can include the headers in paddle/pten/include dirs #include "paddle/pten/kernels/math_kernel.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/fill_any_like_op.h b/paddle/fluid/operators/fill_any_like_op.h index 287bbbfa3b343..19f6e7a4ef51f 100644 --- a/paddle/fluid/operators/fill_any_like_op.h +++ b/paddle/fluid/operators/fill_any_like_op.h @@ -19,7 +19,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/pten_utils.h" -#include "paddle/pten/include/core.h" #include "paddle/pten/kernels/full_kernel.h" namespace paddle { diff --git a/paddle/fluid/operators/flatten_op.h b/paddle/fluid/operators/flatten_op.h index ef42619bfe4ff..8e54ecb922f5a 100644 --- a/paddle/fluid/operators/flatten_op.h +++ b/paddle/fluid/operators/flatten_op.h @@ -20,7 +20,6 @@ limitations under the License. */ #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/pooling.h" #include "paddle/fluid/platform/device_context.h" -#include "paddle/pten/include/core.h" #include "paddle/pten/kernels/empty_kernel.h" #include "paddle/pten/kernels/flatten_grad_kernel.h" #include "paddle/pten/kernels/flatten_kernel.h" diff --git a/paddle/fluid/operators/matmul_v2_op.h b/paddle/fluid/operators/matmul_v2_op.h index e93bd212868fd..9ab77cdcaec0a 100644 --- a/paddle/fluid/operators/matmul_v2_op.h +++ b/paddle/fluid/operators/matmul_v2_op.h @@ -27,7 +27,6 @@ limitations under the License. */ // only can include the headers in paddle/pten/api dirs #include "paddle/pten/api/lib/utils/tensor_utils.h" -#include "paddle/pten/include/core.h" #include "paddle/pten/kernels/matmul_grad_kernel.h" #include "paddle/pten/kernels/matmul_kernel.h" diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h index e1854d8a13d8b..eb4d4a5c1680e 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.h @@ -26,8 +26,6 @@ limitations under the License. */ // only can include the headers in paddle/pten/api dirs #include "paddle/pten/api/lib/utils/tensor_utils.h" -#include "paddle/pten/include/core.h" -#include "paddle/pten/include/math.h" #include "paddle/pten/kernels/cpu/reduce.h" #if defined(__HIPCC__) || defined(__NVCC__) diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index a25e53aac5d73..47b8da70adbac 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -20,7 +20,6 @@ limitations under the License. 
*/ // only can include the headers in paddle/pten/api dirs #include "paddle/pten/api/lib/utils/tensor_utils.h" #include "paddle/pten/common/scalar_array.h" -#include "paddle/pten/include/core.h" #include "paddle/pten/kernels/reshape_grad_kernel.h" #include "paddle/pten/kernels/reshape_kernel.h" namespace paddle { diff --git a/paddle/fluid/operators/scale_op.h b/paddle/fluid/operators/scale_op.h index 6011fe9a66b60..a6f4f6e27204e 100644 --- a/paddle/fluid/operators/scale_op.h +++ b/paddle/fluid/operators/scale_op.h @@ -19,7 +19,6 @@ limitations under the License. */ // only can include the headers in paddle/top/api dirs #include "paddle/pten/api/lib/utils/tensor_utils.h" -#include "paddle/pten/include/core.h" #include "paddle/pten/kernels/scale_kernel.h" namespace paddle { @@ -70,8 +69,8 @@ class ScaleKernel : public framework::OpKernel { auto pt_out = paddle::experimental::MakePtenDenseTensor(*out); // call new kernel - pten::Scale(dev_ctx, *pt_x.get(), scale, bias, bias_after_scale, - pt_out.get()); + pten::ScaleKernel(dev_ctx, *pt_x.get(), scale, bias, bias_after_scale, + pt_out.get()); } }; diff --git a/paddle/fluid/operators/sign_op.h b/paddle/fluid/operators/sign_op.h index b8dd44c01b050..8294cd2c5f145 100644 --- a/paddle/fluid/operators/sign_op.h +++ b/paddle/fluid/operators/sign_op.h @@ -19,7 +19,6 @@ limitations under the License. */ #include "paddle/fluid/framework/pten_utils.h" #include "paddle/fluid/operators/eigen/eigen_function.h" -#include "paddle/pten/include/core.h" #include "paddle/pten/kernels/sign_kernel.h" namespace paddle { diff --git a/paddle/fluid/pybind/eager.cc b/paddle/fluid/pybind/eager.cc index 9484d506b20fb..102bc9f162b0f 100644 --- a/paddle/fluid/pybind/eager.cc +++ b/paddle/fluid/pybind/eager.cc @@ -26,7 +26,6 @@ limitations under the License. */ #include "paddle/pten/common/data_type.h" #include "paddle/pten/core/convert_utils.h" #include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/include/core.h" #include "pybind11/numpy.h" #include "pybind11/pybind11.h" #pragma GCC diagnostic ignored "-Wmissing-field-initializers" diff --git a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc index 659df6b9b44de..aaf86bc41aeff 100644 --- a/paddle/fluid/pybind/eager_functions.cc +++ b/paddle/fluid/pybind/eager_functions.cc @@ -34,7 +34,6 @@ limitations under the License. */ #include "paddle/pten/common/data_type.h" #include "paddle/pten/core/convert_utils.h" #include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/include/core.h" namespace paddle { namespace pybind { diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index a0067f9c64fb1..a8c1da2a8b866 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -31,7 +31,6 @@ limitations under the License. */ #include "paddle/pten/common/data_type.h" #include "paddle/pten/core/convert_utils.h" #include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/include/core.h" namespace paddle { namespace pybind { diff --git a/paddle/fluid/pybind/eager_properties.cc b/paddle/fluid/pybind/eager_properties.cc index 71b8bbbb1a283..038a1254d7ef6 100644 --- a/paddle/fluid/pybind/eager_properties.cc +++ b/paddle/fluid/pybind/eager_properties.cc @@ -28,7 +28,6 @@ limitations under the License. 
*/ #include "paddle/pten/common/data_type.h" #include "paddle/pten/core/convert_utils.h" #include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/include/core.h" #pragma GCC diagnostic ignored "-Wwrite-strings" namespace paddle { diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc index 9849d0d41611b..c1049d240795c 100644 --- a/paddle/fluid/pybind/eager_utils.cc +++ b/paddle/fluid/pybind/eager_utils.cc @@ -26,7 +26,6 @@ limitations under the License. */ #include "paddle/pten/common/data_type.h" #include "paddle/pten/core/convert_utils.h" #include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/include/core.h" namespace paddle { namespace pybind { diff --git a/paddle/pten/CMakeLists.txt b/paddle/pten/CMakeLists.txt index 6a823ff3672bf..a9b7c7581bc2b 100644 --- a/paddle/pten/CMakeLists.txt +++ b/paddle/pten/CMakeLists.txt @@ -29,4 +29,4 @@ get_property(pten_kernels GLOBAL PROPERTY PTEN_KERNELS) message(STATUS "All standard pten kernels: ${pten_kernels}") set(PTEN_DEPS ${PTEN_DEPS} ${pten_kernels}) -cc_library(pten SRCS all.cc DEPS ${PTEN_DEPS}) +cc_library(pten DEPS ${PTEN_DEPS}) diff --git a/paddle/pten/all.cc b/paddle/pten/all.cc deleted file mode 100644 index d8d96e1cd461e..0000000000000 --- a/paddle/pten/all.cc +++ /dev/null @@ -1,17 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/pten/all.h" - -namespace pten {} // namespace pten diff --git a/paddle/pten/all.h b/paddle/pten/all.h deleted file mode 100644 index c8be629b10e75..0000000000000 --- a/paddle/pten/all.h +++ /dev/null @@ -1,20 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -// developer apis -#include "paddle/pten/include/core.h" -#include "paddle/pten/include/infermeta.h" -#include "paddle/pten/include/math.h" diff --git a/paddle/pten/api/lib/utils.cc b/paddle/pten/api/lib/utils.cc index ddb29c8833f3b..6eb1e5a3797c9 100644 --- a/paddle/pten/api/lib/utils.cc +++ b/paddle/pten/api/lib/utils.cc @@ -22,8 +22,7 @@ limitations under the License. 
*/ #include "paddle/pten/api/lib/kernel_dispatch.h" #include "paddle/pten/api/lib/utils/storage.h" #include "paddle/pten/core/kernel_registry.h" -#include "paddle/pten/include/core.h" -#include "paddle/pten/include/infermeta.h" +#include "paddle/pten/infermeta/unary.h" PT_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT); diff --git a/paddle/pten/include/core.h b/paddle/pten/include/core.h deleted file mode 100644 index 9a042753d1f73..0000000000000 --- a/paddle/pten/include/core.h +++ /dev/null @@ -1,22 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -// See Note: [ How do we organize the kernel directory ] -#include "paddle/pten/core/convert_utils.h" -#include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/core/kernel_context.h" -#include "paddle/pten/core/kernel_factory.h" -#include "paddle/pten/core/tensor_meta.h" diff --git a/paddle/pten/include/infermeta.h b/paddle/pten/include/infermeta.h deleted file mode 100644 index 5e356dd37c03e..0000000000000 --- a/paddle/pten/include/infermeta.h +++ /dev/null @@ -1,21 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -// See Note: [ How do we organize the kernel directory ] -#include "paddle/pten/infermeta/binary.h" -#include "paddle/pten/infermeta/multiary.h" -#include "paddle/pten/infermeta/nullary.h" -#include "paddle/pten/infermeta/unary.h" diff --git a/paddle/pten/include/math.h b/paddle/pten/include/math.h deleted file mode 100644 index a4fb7f4d98faf..0000000000000 --- a/paddle/pten/include/math.h +++ /dev/null @@ -1,39 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -// See Note: [ How do we organize the kernel directory ] -#include "paddle/pten/api/lib/utils/storage.h" -#include "paddle/pten/include/infermeta.h" -#include "paddle/pten/kernels/scale_kernel.h" - -namespace pten { - -template -DenseTensor Scale(const ContextT& dev_ctx, - const DenseTensor& x, - const Scalar& scale, - float bias, - bool bias_after_scale) { - auto out_meta = UnchangedInferMeta(x.meta()); - pten::DenseTensor dense_out( - pten::make_intrusive( - dev_ctx.GetPlace()), - std::move(out_meta)); - Scale(dev_ctx, x, scale, bias, bias_after_scale, &dense_out); - return dense_out; -} - -} // namespace pten diff --git a/paddle/pten/kernels/complex_kernel.h b/paddle/pten/kernels/complex_kernel.h index 9dd3d457e4a26..b6074f117ea14 100644 --- a/paddle/pten/kernels/complex_kernel.h +++ b/paddle/pten/kernels/complex_kernel.h @@ -15,9 +15,6 @@ limitations under the License. */ #pragma once #include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/include/infermeta.h" -#include "paddle/pten/kernels/empty_kernel.h" - #include "paddle/pten/infermeta/unary.h" #include "paddle/pten/kernels/empty_kernel.h" diff --git a/paddle/pten/kernels/cpu/scale_kernel.cc b/paddle/pten/kernels/cpu/scale_kernel.cc index fe9a0a033bced..0582fb87b4457 100644 --- a/paddle/pten/kernels/cpu/scale_kernel.cc +++ b/paddle/pten/kernels/cpu/scale_kernel.cc @@ -13,18 +13,48 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/pten/kernels/scale_kernel.h" -#include "paddle/pten/kernels/impl/scale_kernel_impl.h" #include "paddle/pten/backends/cpu/cpu_context.h" +#include "paddle/pten/common/scalar.h" +#include "paddle/pten/core/dense_tensor.h" #include "paddle/pten/core/kernel_registry.h" +#include "paddle/pten/kernels/funcs/eigen/common.h" // See Note [ Why still include the fluid headers? ] +#include "paddle/fluid/operators/eigen/eigen_function.h" #include "paddle/fluid/platform/bfloat16.h" +namespace pten { + +template +void ScaleKernel(const Context& dev_ctx, + const DenseTensor& x, + const Scalar& scale, + float bias, + bool bias_after_scale, + DenseTensor* out) { + // calc + out->mutable_data(); + auto eigen_out = pten::EigenVector::Flatten(*out); + auto eigen_x = pten::EigenVector::Flatten(x); + auto& dev = *dev_ctx.eigen_device(); + // TODO(chenweihang): now the eigen function here need the dtype of scale, + // eigen_x, bias should be same, so here need cast for two scalar arg, + // maybe we declare that the type of scale and bias is T? + paddle::operators::EigenScale, T>::Eval( + dev, + eigen_out, + eigen_x, + scale.to(), + static_cast(bias), + bias_after_scale); +} + +} // namespace pten PT_REGISTER_CTX_KERNEL(scale, CPU, ALL_LAYOUT, - pten::Scale, + pten::ScaleKernel, float, double, paddle::platform::bfloat16, diff --git a/paddle/pten/kernels/flatten_kernel.h b/paddle/pten/kernels/flatten_kernel.h index a67e66fac4130..c974fda1ed363 100644 --- a/paddle/pten/kernels/flatten_kernel.h +++ b/paddle/pten/kernels/flatten_kernel.h @@ -15,7 +15,7 @@ limitations under the License. 
*/ #pragma once #include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/include/infermeta.h" +#include "paddle/pten/infermeta/unary.h" #include "paddle/pten/kernels/empty_kernel.h" namespace pten { diff --git a/paddle/pten/kernels/gpu/scale_kernel.cu b/paddle/pten/kernels/gpu/scale_kernel.cu index 68574c063e77f..ff7e2a6ed284c 100644 --- a/paddle/pten/kernels/gpu/scale_kernel.cu +++ b/paddle/pten/kernels/gpu/scale_kernel.cu @@ -44,12 +44,12 @@ struct ScaleFunctor { }; template -void Scale(const ContextT& dev_ctx, - const DenseTensor& x, - const Scalar& scale, - float bias, - bool bias_after_scale, - DenseTensor* out) { +void ScaleKernel(const ContextT& dev_ctx, + const DenseTensor& x, + const Scalar& scale, + float bias, + bool bias_after_scale, + DenseTensor* out) { std::vector inputs; std::vector outputs; inputs.emplace_back(&x); @@ -67,7 +67,7 @@ void Scale(const ContextT& dev_ctx, PT_REGISTER_CTX_KERNEL(scale, GPU, ALL_LAYOUT, - pten::Scale, + pten::ScaleKernel, float, double, paddle::platform::float16, diff --git a/paddle/pten/kernels/impl/matmul_grad_kernel_impl.h b/paddle/pten/kernels/impl/matmul_grad_kernel_impl.h index 802cc019d78c5..b1bae78ddc5fa 100644 --- a/paddle/pten/kernels/impl/matmul_grad_kernel_impl.h +++ b/paddle/pten/kernels/impl/matmul_grad_kernel_impl.h @@ -14,8 +14,7 @@ limitations under the License. */ #pragma once -// #include "paddle/pten/kernels/complex_kernel.h" -#include "paddle/pten/include/math.h" +#include "paddle/pten/kernels/complex_kernel.h" #include "paddle/pten/kernels/empty_kernel.h" #include "paddle/pten/kernels/impl/dot_grad_kernel_impl.h" #include "paddle/pten/kernels/impl/matmul_kernel_impl.h" diff --git a/paddle/pten/kernels/impl/scale_kernel_impl.h b/paddle/pten/kernels/impl/scale_kernel_impl.h deleted file mode 100644 index 2e0b158b36b8d..0000000000000 --- a/paddle/pten/kernels/impl/scale_kernel_impl.h +++ /dev/null @@ -1,50 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/pten/common/scalar.h" -#include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/kernels/funcs/eigen/common.h" - -// See Note [ Why still include the fluid headers? ] -#include "paddle/fluid/operators/eigen/eigen_function.h" - -namespace pten { - -template -void Scale(const Context& dev_ctx, - const DenseTensor& x, - const Scalar& scale, - float bias, - bool bias_after_scale, - DenseTensor* out) { - // calc - out->mutable_data(); - auto eigen_out = pten::EigenVector::Flatten(*out); - auto eigen_x = pten::EigenVector::Flatten(x); - auto& dev = *dev_ctx.eigen_device(); - // TODO(chenweihang): now the eigen function here need the dtype of scale, - // eigen_x, bias should be same, so here need cast for two scalar arg, - // maybe we declare that the type of scale and bias is T? 
- paddle::operators::EigenScale, T>::Eval( - dev, - eigen_out, - eigen_x, - scale.to(), - static_cast(bias), - bias_after_scale); -} - -} // namespace pten diff --git a/paddle/pten/kernels/math_kernel.h b/paddle/pten/kernels/math_kernel.h index f87d0a31b470b..e01103fc5b847 100644 --- a/paddle/pten/kernels/math_kernel.h +++ b/paddle/pten/kernels/math_kernel.h @@ -16,7 +16,8 @@ limitations under the License. */ #include "paddle/pten/api/lib/utils/storage.h" #include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/include/infermeta.h" +#include "paddle/pten/infermeta/binary.h" +#include "paddle/pten/infermeta/unary.h" #include "paddle/pten/kernels/empty_kernel.h" namespace pten { diff --git a/paddle/pten/kernels/reshape_kernel.h b/paddle/pten/kernels/reshape_kernel.h index faa51c69ad17c..293f6cd2baf61 100644 --- a/paddle/pten/kernels/reshape_kernel.h +++ b/paddle/pten/kernels/reshape_kernel.h @@ -16,7 +16,7 @@ limitations under the License. */ #include "paddle/pten/common/scalar_array.h" #include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/include/infermeta.h" +#include "paddle/pten/infermeta/unary.h" #include "paddle/pten/kernels/empty_kernel.h" namespace pten { diff --git a/paddle/pten/kernels/scale_kernel.h b/paddle/pten/kernels/scale_kernel.h index 5908050029c7a..ba16db566b8bb 100644 --- a/paddle/pten/kernels/scale_kernel.h +++ b/paddle/pten/kernels/scale_kernel.h @@ -16,15 +16,29 @@ limitations under the License. */ #include "paddle/pten/common/scalar.h" #include "paddle/pten/core/dense_tensor.h" - +#include "paddle/pten/infermeta/unary.h" +#include "paddle/pten/kernels/empty_kernel.h" namespace pten { template -void Scale(const Context& dev_ctx, - const DenseTensor& x, - const Scalar& scale, - float bias, - bool bias_after_scale, - DenseTensor* out); +void ScaleKernel(const Context& dev_ctx, + const DenseTensor& x, + const Scalar& scale, + float bias, + bool bias_after_scale, + DenseTensor* out); + +template +DenseTensor Scale(const ContextT& dev_ctx, + const DenseTensor& x, + const Scalar& scale, + float bias, + bool bias_after_scale) { + auto out_meta = UnchangedInferMeta(x.meta()); + auto dense_out = pten::Empty(dev_ctx, std::move(out_meta)); + ScaleKernel( + dev_ctx, x, scale, bias, bias_after_scale, &dense_out); + return dense_out; +} } // namespace pten diff --git a/paddle/pten/kernels/sign_kernel.h b/paddle/pten/kernels/sign_kernel.h index ba205fc96a15c..304b640d2af69 100644 --- a/paddle/pten/kernels/sign_kernel.h +++ b/paddle/pten/kernels/sign_kernel.h @@ -15,7 +15,7 @@ limitations under the License. 
*/ #pragma once #include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/include/infermeta.h" +#include "paddle/pten/infermeta/unary.h" #include "paddle/pten/kernels/empty_kernel.h" namespace pten { diff --git a/paddle/pten/tests/api/scale_api.h b/paddle/pten/tests/api/scale_api.h index d525b305c7409..41143826c45d8 100644 --- a/paddle/pten/tests/api/scale_api.h +++ b/paddle/pten/tests/api/scale_api.h @@ -23,8 +23,7 @@ #include "paddle/pten/common/scalar.h" #include "paddle/pten/common/scalar_array.h" #include "paddle/pten/core/kernel_registry.h" -#include "paddle/pten/include/core.h" -#include "paddle/pten/include/infermeta.h" +#include "paddle/pten/infermeta/unary.h" #include "paddle/pten/kernels/scale_kernel.h" namespace paddle { @@ -92,42 +91,42 @@ static void ScaleCPU(DataType kernel_dtype, pten::DenseTensor* dense_out) { switch (kernel_dtype) { case pten::DataType::FLOAT64: { - pten::Scale( + pten::ScaleKernel( dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out); break; } case pten::DataType::FLOAT32: { - pten::Scale( + pten::ScaleKernel( dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out); break; } case pten::DataType::BFLOAT16: { - pten::Scale( + pten::ScaleKernel( dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out); break; } case pten::DataType::INT64: { - pten::Scale( + pten::ScaleKernel( dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out); break; } case pten::DataType::INT32: { - pten::Scale( + pten::ScaleKernel( dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out); break; } case pten::DataType::INT16: { - pten::Scale( + pten::ScaleKernel( dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out); break; } case pten::DataType::INT8: { - pten::Scale( + pten::ScaleKernel( dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out); break; } case pten::DataType::UINT8: { - pten::Scale( + pten::ScaleKernel( dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out); break; } @@ -151,42 +150,42 @@ static void ScaleGPU(DataType kernel_dtype, pten::DenseTensor* dense_out) { switch (kernel_dtype) { case pten::DataType::FLOAT64: { - pten::Scale( + pten::ScaleKernel( dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out); break; } case pten::DataType::FLOAT32: { - pten::Scale( + pten::ScaleKernel( dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out); break; } case pten::DataType::FLOAT16: { - pten::Scale( + pten::ScaleKernel( dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out); break; } case pten::DataType::INT64: { - pten::Scale( + pten::ScaleKernel( dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out); break; } case pten::DataType::INT32: { - pten::Scale( + pten::ScaleKernel( dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out); break; } case pten::DataType::INT16: { - pten::Scale( + pten::ScaleKernel( dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out); break; } case pten::DataType::INT8: { - pten::Scale( + pten::ScaleKernel( dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out); break; } case pten::DataType::UINT8: { - pten::Scale( + pten::ScaleKernel( dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out); break; } diff --git a/paddle/pten/tests/kernels/test_scale_dev_api.cc b/paddle/pten/tests/kernels/test_scale_dev_api.cc index ac2922b36f205..fe26f56552b05 100644 --- a/paddle/pten/tests/kernels/test_scale_dev_api.cc +++ 
b/paddle/pten/tests/kernels/test_scale_dev_api.cc
@@ -15,7 +15,7 @@ limitations under the License. */
 #include 
 #include 

-#include "paddle/pten/include/math.h"
+#include "paddle/pten/kernels/scale_kernel.h"

 #include "paddle/pten/api/lib/utils/allocator.h"
 #include "paddle/pten/core/dense_tensor.h"
diff --git a/python/paddle/utils/code_gen/api_gen.py b/python/paddle/utils/code_gen/api_gen.py
index 35720ae32fe38..e8539b11d1455 100644
--- a/python/paddle/utils/code_gen/api_gen.py
+++ b/python/paddle/utils/code_gen/api_gen.py
@@ -345,8 +345,10 @@ def source_include(header_file_path):
 #include "paddle/pten/api/lib/kernel_dispatch.h"
 #include "paddle/pten/api/lib/utils/storage.h"
 #include "paddle/pten/core/kernel_registry.h"
-#include "paddle/pten/include/core.h"
-#include "paddle/pten/include/infermeta.h"
+#include "paddle/pten/infermeta/binary.h"
+#include "paddle/pten/infermeta/multiary.h"
+#include "paddle/pten/infermeta/nullary.h"
+#include "paddle/pten/infermeta/unary.h"
 #include "paddle/pten/kernels/declarations.h"
 """

From 23aa7b08d18d9b6a3e80d6bc31d71b481719b0bd Mon Sep 17 00:00:00 2001
From: Sing_chan <51314274+betterpig@users.noreply.github.com>
Date: Thu, 13 Jan 2022 15:04:01 +0800
Subject: [PATCH 10/24] force close eager_generator.exe (#38896)

* force close eager_generator.exe

* modify according to zhouwei's comment
---
 paddle/scripts/paddle_build.bat | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat
index ca34b12b5d4f8..343ab8ff9f5b7 100644
--- a/paddle/scripts/paddle_build.bat
+++ b/paddle/scripts/paddle_build.bat
@@ -42,7 +42,11 @@ taskkill /f /im nvcc.exe /t 2>NUL
 taskkill /f /im cicc.exe /t 2>NUL
 taskkill /f /im ptxas.exe /t 2>NUL
 taskkill /f /im op_function_generator.exe /t 2>NUL
+taskkill /f /im eager_generator.exe /t 2>NUL
+taskkill /f /im eager_op_function_generator.exe /t 2>NUL
 wmic process where name="op_function_generator.exe" call terminate 2>NUL
+wmic process where name="eager_generator.exe" call terminate 2>NUL
+wmic process where name="eager_op_function_generator.exe" call terminate 2>NUL
 wmic process where name="cvtres.exe" call terminate 2>NUL
 wmic process where name="rc.exe" call terminate 2>NUL
 wmic process where name="cl.exe" call terminate 2>NUL
@@ -509,8 +513,12 @@ taskkill /f /im nvcc.exe /t 2>NUL
 taskkill /f /im cicc.exe /t 2>NUL
 taskkill /f /im ptxas.exe /t 2>NUL
 taskkill /f /im op_function_generator.exe /t 2>NUL
-wmic process where name="cmake.exe" call terminate 2>NUL
+taskkill /f /im eager_generator.exe /t 2>NUL
+taskkill /f /im eager_op_function_generator.exe /t 2>NUL
 wmic process where name="op_function_generator.exe" call terminate 2>NUL
+wmic process where name="eager_generator.exe" call terminate 2>NUL
+wmic process where name="eager_op_function_generator.exe" call terminate 2>NUL
+wmic process where name="cmake.exe" call terminate 2>NUL
 wmic process where name="cvtres.exe" call terminate 2>NUL
 wmic process where name="rc.exe" call terminate 2>NUL
 wmic process where name="cl.exe" call terminate 2>NUL
@@ -972,7 +980,11 @@ taskkill /f /im nvcc.exe /t 2>NUL
 taskkill /f /im cicc.exe /t 2>NUL
 taskkill /f /im ptxas.exe /t 2>NUL
 taskkill /f /im op_function_generator.exe /t 2>NUL
+taskkill /f /im eager_generator.exe /t 2>NUL
+taskkill /f /im eager_op_function_generator.exe /t 2>NUL
 wmic process where name="op_function_generator.exe" call terminate 2>NUL
+wmic process where name="eager_generator.exe" call terminate 2>NUL
+wmic process where 
name="eager_op_function_generator.exe" call terminate 2>NUL wmic process where name="cvtres.exe" call terminate 2>NUL wmic process where name="rc.exe" call terminate 2>NUL wmic process where name="cl.exe" call terminate 2>NUL From 7a5af6306bb3f34ea951203e5e36419c0be9ac11 Mon Sep 17 00:00:00 2001 From: furnace <34057289+windstamp@users.noreply.github.com> Date: Thu, 13 Jan 2022 16:31:19 +0800 Subject: [PATCH 11/24] [NPU] fix expand op (#38526) * [NPU] fix expand op * [NPU] optimize codes * [NPU] optimize codes --- paddle/fluid/operators/expand_op_npu.cc | 26 +++++++++++++++---- .../tests/unittests/npu/test_expand_op_npu.py | 21 +++++++++++++++ 2 files changed, 42 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/operators/expand_op_npu.cc b/paddle/fluid/operators/expand_op_npu.cc index 8ecdd5e8cb695..e9f31f8ddd698 100644 --- a/paddle/fluid/operators/expand_op_npu.cc +++ b/paddle/fluid/operators/expand_op_npu.cc @@ -81,14 +81,30 @@ class ExpandNPUKernel : public framework::OpKernel { out_dims[i] *= expand_times[i]; } - out0->Resize(out_dims); - out0->mutable_data(context.device_context().GetPlace()); - const auto& runner = - NpuOpRunner("TileD", {*in0}, {*out0}, {{"multiples", expand_times}}); + auto place = context.GetPlace(); auto stream = context.template device_context() .stream(); - runner.Run(stream); + + out0->Resize(out_dims); + out0->mutable_data(place); + + bool is_expand_times_all_one = + (out0->numel() == in0->numel()) ? true : false; + + if (is_expand_times_all_one) { + memory::Copy(BOOST_GET_CONST(platform::NPUPlace, place), + out0->mutable_data(place), + BOOST_GET_CONST(platform::NPUPlace, place), in0->data(), + in0->numel() * sizeof(T), stream); + if (out_dims != in_dims) { + out0->Resize(out_dims); + } + } else { + const auto& runner = + NpuOpRunner("TileD", {*in0}, {*out0}, {{"multiples", expand_times}}); + runner.Run(stream); + } } }; } // namespace operators diff --git a/python/paddle/fluid/tests/unittests/npu/test_expand_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_expand_op_npu.py index 375003f79e500..89ac9e09aa348 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_expand_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_expand_op_npu.py @@ -132,5 +132,26 @@ def test_npu(self): self.assertTrue(np.allclose(npu_loss, cpu_loss)) +# ------------------------------------------------ +# Special Cases for NPU +# ------------------------------------------------ + + +class TestExpand_expand_times_all_one(TestExpand): + def setUp(self): + self.set_npu() + self.op_type = "expand" + self.place = paddle.NPUPlace(0) + + self.init_dtype() + np.random.seed(SEED) + x = np.random.randn(3, 1, 7).astype(self.dtype) + out = np.tile(x, [1, 1, 1]) + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.attrs = {'expand_times': [1, 1, 1]} + self.outputs = {'Out': out} + + if __name__ == '__main__': unittest.main() From eaccdc71dd04b1f42ceac170c82754dd0a953867 Mon Sep 17 00:00:00 2001 From: furnace <34057289+windstamp@users.noreply.github.com> Date: Thu, 13 Jan 2022 16:34:17 +0800 Subject: [PATCH 12/24] [NPU] fix tril_triu (#38864) [NPU] fix tril_triu --- paddle/fluid/operators/tril_triu_op_npu.cc | 41 ++++++++++++++++--- .../unittests/npu/test_tril_triu_op_npu.py | 16 +++++++- 2 files changed, 51 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/operators/tril_triu_op_npu.cc b/paddle/fluid/operators/tril_triu_op_npu.cc index ab7a9035fb974..02af711567f84 100644 --- a/paddle/fluid/operators/tril_triu_op_npu.cc +++ 
b/paddle/fluid/operators/tril_triu_op_npu.cc @@ -33,12 +33,41 @@ class TrilTriuNPUKernel : public framework::OpKernel { framework::NPUAttributeMap attr_input = {{"diagonal", diagonal}}; - auto stream = - ctx.template device_context() - .stream(); + const auto& dev_ctx = + ctx.template device_context(); - const auto& runner = NpuOpRunner(op_type, {*x}, {*out}, attr_input); - runner.Run(stream); + auto op_func_tril = [](const std::vector& inputs, + const std::vector& outputs, + const NPUAttributeMap& attrs, + const platform::NPUDeviceContext& dev_ctx) { + const auto& runner = NpuOpRunner("Tril", inputs, outputs, attrs); + runner.Run(dev_ctx.stream()); + }; + + auto op_func_triu = [](const std::vector& inputs, + const std::vector& outputs, + const NPUAttributeMap& attrs, + const platform::NPUDeviceContext& dev_ctx) { + const auto& runner = NpuOpRunner("Triu", inputs, outputs, attrs); + runner.Run(dev_ctx.stream()); + }; + + if (x->type() == framework::proto::VarType::BOOL) { + if (lower) { + NpuOpRunner::TypeAdapter({*x}, {*out}, attr_input, dev_ctx, + op_func_tril, + {framework::proto::VarType::UINT8}, + {framework::proto::VarType::UINT8}); + } else { + NpuOpRunner::TypeAdapter({*x}, {*out}, attr_input, dev_ctx, + op_func_triu, + {framework::proto::VarType::UINT8}, + {framework::proto::VarType::UINT8}); + } + } else { + const auto& runner = NpuOpRunner(op_type, {*x}, {*out}, attr_input); + runner.Run(dev_ctx.stream()); + } } }; @@ -49,4 +78,6 @@ namespace ops = paddle::operators; namespace plat = paddle::platform; REGISTER_OP_NPU_KERNEL( tril_triu, ops::TrilTriuNPUKernel, + ops::TrilTriuNPUKernel, + ops::TrilTriuNPUKernel, ops::TrilTriuNPUKernel); diff --git a/python/paddle/fluid/tests/unittests/npu/test_tril_triu_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_tril_triu_op_npu.py index 13adc25a38ca5..8239dd4f3fa89 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_tril_triu_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_tril_triu_op_npu.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from paddle.fluid.tests.unittests.op_test import OpTest +from paddle.fluid.tests.unittests.op_test import OpTest, skip_check_grad_ci import paddle import paddle.fluid as fluid import paddle.tensor as tensor @@ -187,5 +187,19 @@ def test_fluid_api(self): fetch_list=[triu_out]) +# @skip_check_grad_ci(reason="[NPU does not support grad right now.") +class TestNPUTrilTriu_bool(TestNPUTrilTriu): + def test_check_output(self): + self.check_output_with_place(self.place) + + def init_dtype(self): + self.dtype = np.bool + + def initTestCase(self): + self.real_op_type = np.random.choice(['triu', 'tril']) + self.diagonal = None + self.X = np.random.choice([False, True], size=(100)).reshape([10, -1]) + + if __name__ == '__main__': unittest.main() From 7f1234563ff3aab32168a6fbaeb57d73748981c3 Mon Sep 17 00:00:00 2001 From: shangliang Xu Date: Thu, 13 Jan 2022 17:24:53 +0800 Subject: [PATCH 13/24] [bug fix] fix unfold bug in compile time (#38907) --- paddle/fluid/operators/unfold_op.cc | 35 +++++++++++++---------------- paddle/fluid/operators/unfold_op.h | 10 +-------- 2 files changed, 16 insertions(+), 29 deletions(-) diff --git a/paddle/fluid/operators/unfold_op.cc b/paddle/fluid/operators/unfold_op.cc index 3f580884aa515..5a8e7e3efbe82 100644 --- a/paddle/fluid/operators/unfold_op.cc +++ b/paddle/fluid/operators/unfold_op.cc @@ -143,22 +143,18 @@ class UnfoldOp : public framework::OperatorWithKernel { "but recieved dilations_height: %d dilations_width: %d.", dilations[0], 
dilations[1])); - bool contain_unknown_dim = framework::contain_unknown_dim(in_dims); - bool check = ctx->IsRuntime() || !contain_unknown_dim; - if (check) { - std::vector out_dims; - out_dims.push_back(in_dims[0]); - - int output_channels = in_dims[1] * kernel_sizes[0] * kernel_sizes[1]; - out_dims.push_back(output_channels); - - int output_height = - CalcOutputSize(in_dims[2], kernel_sizes[0], dilations[0], paddings[0], - paddings[2], strides[0]); - int output_width = - CalcOutputSize(in_dims[3], kernel_sizes[1], dilations[1], paddings[1], - paddings[3], strides[1]); - // check output height and width + std::vector out_dims; + out_dims.push_back(in_dims[0]); + int output_channels = in_dims[1] * kernel_sizes[0] * kernel_sizes[1]; + out_dims.push_back(output_channels); + + int output_height = + CalcOutputSize(in_dims[2], kernel_sizes[0], dilations[0], paddings[0], + paddings[2], strides[0]); + int output_width = CalcOutputSize(in_dims[3], kernel_sizes[1], dilations[1], + paddings[1], paddings[3], strides[1]); + if (ctx->IsRuntime()) { + // only check output height and width in runtime PADDLE_ENFORCE_GT( output_height, 0, platform::errors::InvalidArgument( @@ -179,11 +175,10 @@ class UnfoldOp : public framework::OperatorWithKernel { in_dims[2], in_dims[3], kernel_sizes[0], kernel_sizes[1], strides[0], strides[1], dilations[0], dilations[1], output_height, output_width)); - int output_col_length = output_height * output_width; - out_dims.push_back(output_col_length); - - ctx->SetOutputDim("Y", framework::make_ddim(out_dims)); } + int output_col_length = output_height * output_width; + out_dims.push_back(output_col_length); + ctx->SetOutputDim("Y", framework::make_ddim(out_dims)); } protected: diff --git a/paddle/fluid/operators/unfold_op.h b/paddle/fluid/operators/unfold_op.h index f22559f1f38c2..006e4822fead0 100644 --- a/paddle/fluid/operators/unfold_op.h +++ b/paddle/fluid/operators/unfold_op.h @@ -16,6 +16,7 @@ #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/im2col.h" #include "paddle/fluid/operators/math/math_function.h" @@ -29,15 +30,6 @@ inline int CalcOutputSize(int input_size, int filter_size, int dilation, int padding1, int padding2, int stride) { const int dkernel = dilation * (filter_size - 1) + 1; int output_size = (input_size + padding1 + padding2 - dkernel) / stride + 1; - PADDLE_ENFORCE_GT( - output_size, 0UL, - platform::errors::InvalidArgument( - "Due to the settings of padding(%d, %d), filter_size(%d), " - "dilation(%d) and " - "stride(%d), the output size is less than 0, please check " - "again. 
Input_size:%d", - padding1, padding2, filter_size, dilation, stride, input_size)); - return output_size; } From dccdc719ebd863db342c3ef1c8794be2ee391348 Mon Sep 17 00:00:00 2001 From: Wangzheee <634486483@qq.com> Date: Thu, 13 Jan 2022 19:33:45 +0800 Subject: [PATCH 14/24] [Paddle-Inference] add Paddle Trt config: with_interleaved (#38884) * add Paddle Trt config: with_interleaved --- paddle/fluid/inference/analysis/argument.h | 1 + .../inference/analysis/ir_pass_manager.cc | 2 + .../ir_passes/tensorrt_subgraph_pass.cc | 1 + paddle/fluid/inference/api/analysis_config.cc | 3 + .../fluid/inference/api/analysis_predictor.cc | 7 +++ .../inference/api/paddle_analysis_config.h | 2 + paddle/fluid/inference/api/paddle_api.h | 21 +++++++ .../inference/api/paddle_inference_api.h | 16 ----- .../tensorrt/convert/batch_norm_op.cc | 17 ++++-- .../tensorrt/convert/elementwise_op.cc | 14 +++-- .../inference/tensorrt/convert/gather_op.cc | 2 + .../inference/tensorrt/convert/op_converter.h | 58 ++++++++++++------- .../inference/tensorrt/convert/scale_op.cc | 16 +++++ .../inference/tensorrt/convert/slice_op.cc | 30 +++++----- paddle/fluid/inference/tensorrt/engine.h | 5 ++ 15 files changed, 136 insertions(+), 59 deletions(-) diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index aff2f60551de9..175bc55dcff17 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -212,6 +212,7 @@ struct Argument { bool); DECL_ARGUMENT_FIELD(tensorrt_use_calib_mode, TensorRtUseCalibMode, bool); DECL_ARGUMENT_FIELD(tensorrt_use_oss, TensorRtUseOSS, bool); + DECL_ARGUMENT_FIELD(tensorrt_with_interleaved, TensorRtWithInterleaved, bool); DECL_ARGUMENT_FIELD(tensorrt_shape_range_info_path, TensorRtShapeRangeInfoPath, std::string); DECL_ARGUMENT_FIELD(tensorrt_tuned_dynamic_shape, TensorRtTunedDynamicShape, diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index dcbbee97a772c..3abda782ab6cf 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -108,6 +108,8 @@ void IRPassManager::CreatePasses(Argument *argument, pass->Set("enable_int8", new bool(enable_int8)); pass->Set("use_calib_mode", new bool(use_calib_mode)); pass->Set("use_oss", new bool(argument->tensorrt_use_oss())); + pass->Set("with_interleaved", + new bool(argument->tensorrt_with_interleaved())); pass->Set("precision_mode", new AnalysisConfig::Precision(precision_mode)); diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index a21118e23aa5c..ef50df3084f8c 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -369,6 +369,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp( Get("gpu_device_id"), min_input_shape, max_input_shape, opt_input_shape, disable_trt_plugin_fp16); trt_engine->SetUseOSS(Get("use_oss")); + trt_engine->SetWithInterleaved(Get("with_interleaved")); trt_engine->SetUseDLA(Get("trt_use_dla")); trt_engine->SetDLACore(Get("trt_dla_core")); diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index a1ab69906bfc4..273690719336c 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -189,6 +189,7 @@ 
AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { CP_MEMBER(trt_use_static_engine_); CP_MEMBER(trt_use_calib_mode_); CP_MEMBER(trt_use_oss_); + CP_MEMBER(trt_with_interleaved_); CP_MEMBER(trt_tuned_dynamic_shape_); CP_MEMBER(trt_allow_build_at_runtime_); CP_MEMBER(collect_shape_range_info_); @@ -864,6 +865,8 @@ std::string AnalysisConfig::Summary() { : "false"}); os.InsertRow({"tensorrt_use_oss", trt_use_oss_ ? "true" : "false"}); + os.InsertRow({"tensorrt_with_interleaved", + trt_with_interleaved_ ? "true" : "false"}); os.InsertRow({"tensorrt_use_dla", trt_use_dla_ ? "true" : "false"}); if (trt_use_dla_) { os.InsertRow({"tensorrt_dla_core", std::to_string(trt_dla_core_)}); diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 929984f50a7b8..2799fb9e174d3 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -605,6 +605,7 @@ void AnalysisPredictor::PrepareArgument() { argument_.SetTensorRtUseStaticEngine(config_.trt_use_static_engine_); argument_.SetTensorRtUseCalibMode(config_.trt_use_calib_mode_); argument_.SetTensorRtUseOSS(config_.trt_use_oss_); + argument_.SetTensorRtWithInterleaved(config_.trt_with_interleaved_); argument_.SetMinInputShape(config_.min_input_shape_); argument_.SetMaxInputShape(config_.max_input_shape_); argument_.SetOptimInputShape(config_.optim_input_shape_); @@ -1603,5 +1604,11 @@ bool InternalUtils::RunWithExternalStream(paddle_infer::Predictor *p, #endif return false; } +void InternalUtils::UpdateConfigInterleaved(paddle_infer::Config *c, + bool with_interleaved) { +#ifdef PADDLE_WITH_CUDA + c->trt_with_interleaved_ = with_interleaved; +#endif +} } // namespace experimental } // namespace paddle_infer diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index 77409f95b042e..f65170daccb62 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -796,6 +796,7 @@ struct PD_INFER_DECL AnalysisConfig { bool trt_use_static_engine_{false}; bool trt_use_calib_mode_{true}; bool trt_use_oss_{false}; + bool trt_with_interleaved_{false}; bool trt_use_dla_{false}; int trt_dla_core_{0}; std::map> min_input_shape_{}; @@ -883,6 +884,7 @@ struct PD_INFER_DECL AnalysisConfig { // So we release the memory when the predictor is set up. mutable bool is_valid_{true}; std::string opt_cache_dir_; + friend class paddle_infer::experimental::InternalUtils; }; } // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h index b137b7ba6f97e..c129efe494b4f 100644 --- a/paddle/fluid/inference/api/paddle_api.h +++ b/paddle/fluid/inference/api/paddle_api.h @@ -405,3 +405,24 @@ PD_INFER_DECL std::shared_ptr MakeCipher( const std::string& config_file); } // namespace paddle + +// forward declation +using cudaStream_t = struct CUstream_st*; +using hipStream_t = struct ihipStream_t*; + +namespace paddle_infer { +class Predictor; +using Config = paddle::AnalysisConfig; +namespace experimental { +class PD_INFER_DECL InternalUtils { + public: + // Note: Can only be used under thread_local semantics. 
+ static bool RunWithExternalStream(paddle_infer::Predictor* pred, + cudaStream_t stream); + static bool RunWithExternalStream(paddle_infer::Predictor* pred, + hipStream_t stream); + static void UpdateConfigInterleaved(paddle_infer::Config* c, + bool with_interleaved); +}; +} // namespace experimental +} // namespace paddle_infer diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h index b2b9f2e407478..65906a57f46cb 100644 --- a/paddle/fluid/inference/api/paddle_inference_api.h +++ b/paddle/fluid/inference/api/paddle_inference_api.h @@ -41,27 +41,11 @@ limitations under the License. */ /// \since 2.0.0-beta /// -// forward declation -using cudaStream_t = struct CUstream_st*; -using hipStream_t = struct ihipStream_t*; - namespace paddle_infer { using PrecisionType = paddle::AnalysisConfig::Precision; using Config = paddle::AnalysisConfig; -class Predictor; -namespace experimental { -class PD_INFER_DECL InternalUtils { - public: - // Note: Can only be used under thread_local semantics. - static bool RunWithExternalStream(paddle_infer::Predictor* pred, - cudaStream_t stream); - static bool RunWithExternalStream(paddle_infer::Predictor* pred, - hipStream_t stream); -}; -} // namespace experimental - /// /// \class Predictor /// diff --git a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc index 71a2fa68f1749..0e66165191474 100644 --- a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc @@ -45,7 +45,7 @@ class BatchNormOpConverter : public OpConverter { auto* Scale_v = scope.FindVar(op_desc.Input("Scale").front()); auto* Variance_v = scope.FindVar(op_desc.Input("Variance").front()); const float eps = BOOST_GET_CONST(float, op_desc.GetAttr("epsilon")); - + auto output_name = op_desc.Output("Y").front(); PADDLE_ENFORCE_NOT_NULL( Bias_v, platform::errors::NotFound( @@ -145,6 +145,10 @@ class BatchNormOpConverter : public OpConverter { expand_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *X); expand_layer->setReshapeDimensions(expand_shape); X = expand_layer->getOutput(0); + expand_layer->getOutput(0)->setName( + ("reshape_before_batchnorm_out: " + output_name).c_str()); + expand_layer->setName( + ("BN_Shuffle: (Output: " + output_name + ")").c_str()); } layer = TRT_ENGINE_ADD_LAYER(engine_, ScaleNd, *X, @@ -152,12 +156,13 @@ class BatchNormOpConverter : public OpConverter { shift_weights.get(), scale_weights.get(), power_weights.get(), dynamic_shape_offset); - auto output_name = op_desc.Output("Y").front(); engine_->SetWeights(op_desc.Input("Bias").front(), std::move(combile_bias_tensor)); engine_->SetWeights(op_desc.Input("Scale").front(), std::move(combile_scale_tensor)); if (x_dim.nbDims < 3 + dynamic_shape_offset) { + layer->getOutput(0)->setName("batch_norm_out"); + layer->setName(("BN: ScaleNd: (Output: " + output_name + ")").c_str()); nvinfer1::Dims squeeze_shape; squeeze_shape.nbDims = x_dim.nbDims; for (int i = 0; i < squeeze_shape.nbDims; i++) { @@ -166,10 +171,12 @@ class BatchNormOpConverter : public OpConverter { squeeze_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *(layer->getOutput(0))); squeeze_layer->setReshapeDimensions(squeeze_shape); - layer = static_cast(squeeze_layer); + RreplenishLayerAndOutput(squeeze_layer, "batchnorm_add_scale", + {output_name}, test_mode); + } else { + RreplenishLayerAndOutput(layer, "batchnorm_add_scale", {output_name}, + test_mode); } - 
RreplenishLayerAndOutput(layer, "batchnorm_add_scale", {output_name}, - test_mode); } }; diff --git a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc index 7c5af43816c44..33f732c19a875 100644 --- a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc @@ -50,6 +50,7 @@ class ElementwiseWeightOpConverter : public OpConverter { op_desc.Input("Y").front().c_str())); auto* Y_t = Y_v->GetMutable(); float* weight_data = nullptr; + auto output_name = op_desc.Output("Out")[0]; weight_data = engine_->GetWeightCPUData(op_desc.Input("Y").front(), Y_t, false); nvinfer1::Dims dims_x = X->getDimensions(); @@ -80,6 +81,10 @@ class ElementwiseWeightOpConverter : public OpConverter { expand_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *X); expand_layer->setReshapeDimensions(expand_shape); X = expand_layer->getOutput(0); + expand_layer->getOutput(0)->setName( + ("elementwise_reshape_out: " + output_name).c_str()); + expand_layer->setName( + ("Elewise: Shuffle: (Output: " + output_name + ")").c_str()); } if (op_type_ == "add") { nvinfer1::IScaleLayer* scale_layer = TRT_ENGINE_ADD_LAYER( @@ -101,11 +106,12 @@ class ElementwiseWeightOpConverter : public OpConverter { squeeze_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *(layer->getOutput(0))); squeeze_layer->setReshapeDimensions(squeeze_shape); - layer = static_cast(squeeze_layer); + RreplenishLayerAndOutput(squeeze_layer, "elementwise_" + op_type_, + {output_name}, test_mode); + } else { + RreplenishLayerAndOutput(layer, "elementwise_" + op_type_, + {output_name}, test_mode); } - auto output_name = op_desc.Output("Out")[0]; - RreplenishLayerAndOutput(layer, "elementwise_" + op_type_, {output_name}, - test_mode); if (op_desc.HasAttr("enable_int8")) { #if IS_TRT_VERSION_GE(5000) CHECK(op_desc.HasAttr("X_scale")); diff --git a/paddle/fluid/inference/tensorrt/convert/gather_op.cc b/paddle/fluid/inference/tensorrt/convert/gather_op.cc index e7b82388b6ab8..a98e7535de1b8 100644 --- a/paddle/fluid/inference/tensorrt/convert/gather_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/gather_op.cc @@ -56,6 +56,8 @@ class GatherOpConverter : public OpConverter { index_shape.d[0] = -1; reshape_layer->setReshapeDimensions(index_shape); + reshape_layer->setName( + ("Gather: Shuffle: (Output: " + output_name + ")").c_str()); auto layer = TRT_ENGINE_ADD_LAYER(engine_, Gather, *input_tensor, *reshape_layer->getOutput(0), axis); diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h index 57a26aec6ebcb..7e0c8bf1da177 100644 --- a/paddle/fluid/inference/tensorrt/convert/op_converter.h +++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h @@ -144,28 +144,44 @@ class OpConverter { it->SetEngine(engine); (*it)(op, scope, test_mode); - bool has_out_scale = op_desc.HasAttr("out_threshold"); - if (has_out_scale) { - float out_scale = - BOOST_GET_CONST(float, op_desc.GetAttr("out_threshold")); - std::string output_name = ""; - if (op_desc.HasOutput("Output")) { - output_name = op_desc.Output("Output").front(); - } else if (op_desc.HasOutput("Out")) { - output_name = op_desc.Output("Out").front(); - } else if (op_desc.HasOutput("Y")) { - output_name = op_desc.Output("Y").front(); - } else { - PADDLE_THROW( - platform::errors::NotFound("Op %s has out threshold but doesn't " - "have an output named \"Output\", " - "\"Out\" or \"Y\".", - op_desc.Type())); + size_t output_num 
= op_desc.OutputNames().size(); + if (output_num == 1) { // The number of output is 1 + if (op_desc.HasAttr("out_threshold")) { + float out_scale = + BOOST_GET_CONST(float, op_desc.GetAttr("out_threshold")); + std::string output_name = ""; + if (op_desc.HasOutput("Output")) { + output_name = op_desc.Output("Output").front(); + } else if (op_desc.HasOutput("Out")) { + output_name = op_desc.Output("Out").front(); + } else if (op_desc.HasOutput("Y")) { + output_name = op_desc.Output("Y").front(); + } else { + PADDLE_THROW( + platform::errors::NotFound("Op %s has out threshold but doesn't " + "have an output named \"Output\", " + "\"Out\" or \"Y\".", + op_desc.Type())); + } + auto* output_itensor = engine->GetITensor(output_name); + engine->SetTensorDynamicRange(output_itensor, out_scale); + VLOG(1) << "Set out scale = " << out_scale << " for tensor " + << output_name << "."; + } + } else if (output_num > 1) { // The number of outputs greater than 1 + for (size_t i = 0; i < output_num; ++i) { + if (op_desc.HasAttr("out_" + std::to_string(i) + "_threshold")) { + float out_scale = BOOST_GET_CONST( + float, + op_desc.GetAttr("out_" + std::to_string(i) + "_threshold")); + std::string output_name = + op_desc.Output(op_desc.OutputNames()[i]).front(); + auto* output_itensor = engine->GetITensor(output_name); + engine->SetTensorDynamicRange(output_itensor, out_scale); + VLOG(1) << "Set out scale = " << out_scale << " for tensor " + << output_name << "."; + } } - auto* output_itensor = engine->GetITensor(output_name); - engine->SetTensorDynamicRange(output_itensor, out_scale); - VLOG(1) << "Set out scale = " << out_scale << " for tensor " - << output_name << "."; } } diff --git a/paddle/fluid/inference/tensorrt/convert/scale_op.cc b/paddle/fluid/inference/tensorrt/convert/scale_op.cc index b527f2db53808..8b23a8161f593 100644 --- a/paddle/fluid/inference/tensorrt/convert/scale_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/scale_op.cc @@ -89,21 +89,34 @@ class ScaleOpConverter : public OpConverter { expand_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input); expand_layer->setReshapeDimensions(expand_shape); input = expand_layer->getOutput(0); + expand_layer->getOutput(0)->setName( + ("before_reshape_out: " + out_name).c_str()); + expand_layer->setName( + ("Scale: before_reshape (Output: " + out_name + ")").c_str()); } if (bias_after_scale) { layer = TRT_ENGINE_ADD_LAYER( engine_, Scale, *input, nvinfer1::ScaleMode::kUNIFORM, shift_weights.get(), scale_weights.get(), power_weights.get()); + layer->getOutput(0)->setName( + ("bias_after_scale_out: " + out_name).c_str()); + layer->setName(("Scale: scale (Output: " + out_name + ")").c_str()); } else { // add bias layer = TRT_ENGINE_ADD_LAYER( engine_, Scale, *(input), nvinfer1::ScaleMode::kUNIFORM, shift_weights.get(), power_weights.get(), power_weights.get()); + layer->getOutput(0)->setName( + ("bias_before_scale:bias_out: " + out_name).c_str()); + layer->setName(("Scale: scale_bias (Output: " + out_name + ")").c_str()); // mul scale layer = TRT_ENGINE_ADD_LAYER( engine_, Scale, *(layer->getOutput(0)), nvinfer1::ScaleMode::kUNIFORM, power_weights.get(), scale_weights.get(), power_weights.get()); + layer->getOutput(0)->setName( + ("bias_before_scale:scale_out: " + out_name).c_str()); + layer->setName(("Scale: scale_scale (Output: " + out_name + ")").c_str()); } PADDLE_ENFORCE_EQ(layer != nullptr, true, @@ -119,6 +132,9 @@ class ScaleOpConverter : public OpConverter { TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *(layer->getOutput(0))); 
squeeze_layer->setReshapeDimensions(squeeze_shape); layer = static_cast(squeeze_layer); + layer->getOutput(0)->setName(("after_reshape_out: " + out_name).c_str()); + layer->setName( + ("Scale: Shuffle_reshape (Output: " + out_name + ")").c_str()); } RreplenishLayerAndOutput(layer, "scale", {out_name}, test_mode); } diff --git a/paddle/fluid/inference/tensorrt/convert/slice_op.cc b/paddle/fluid/inference/tensorrt/convert/slice_op.cc index 7f270b1f390b7..2c08f0fe2bded 100644 --- a/paddle/fluid/inference/tensorrt/convert/slice_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/slice_op.cc @@ -30,10 +30,11 @@ class SliceOpConverter : public OpConverter { framework::OpDesc op_desc(op, nullptr); // Declare inputs auto* input = engine_->GetITensor(op_desc.Input("Input")[0]); + auto output_name = op_desc.Output("Out")[0]; + float out_scale = 1; if (op_desc.HasAttr("out_threshold")) { - float out_scale = - BOOST_GET_CONST(float, op_desc.GetAttr("out_threshold")); + out_scale = BOOST_GET_CONST(float, op_desc.GetAttr("out_threshold")); engine_->SetTensorDynamicRange(input, out_scale); } @@ -71,12 +72,22 @@ class SliceOpConverter : public OpConverter { nvinfer1::ILayer* layer = nullptr; if (engine_->with_dynamic_shape()) { -#if IS_TRT_VERSION_GE(6000) if (engine_->use_oss() && engine_->with_ernie()) { std::vector plugin_inputs; - // plugin_inputs.emplace_back(trans_layer->getOutput(0)); - plugin_inputs.emplace_back(input); - + if (engine_->with_interleaved()) { + auto* shuffler_slice = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input); + nvinfer1::Permutation transpose_embed{2, 1, 0, 3}; + shuffler_slice->setSecondTranspose(transpose_embed); + engine_->SetTensorDynamicRange(shuffler_slice->getOutput(0), + out_scale); + shuffler_slice->setName( + ("SpecialSlice_interleaved: Shuffle: (Output: " + output_name + + ")") + .c_str()); + plugin_inputs.emplace_back(shuffler_slice->getOutput(0)); + } else { + plugin_inputs.emplace_back(input); + } std::string pos_name; if (engine_->Has("ernie_pos_name")) { pos_name = engine_->Get("ernie_pos_name"); @@ -99,11 +110,6 @@ class SliceOpConverter : public OpConverter { new plugin::SlicePluginDynamic(starts, ends, axes, with_fp16); layer = engine_->AddDynamicPlugin(&input, 1, plugin); } -#else - PADDLE_THROW(platform::errors::Fatal( - "You are running the TRT Dynamic Shape mode, need to confirm that " - "your TRT version is no less than 6.0")); -#endif } else { bool with_fp16 = engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); @@ -111,8 +117,6 @@ class SliceOpConverter : public OpConverter { new plugin::SlicePlugin(starts, ends, axes, with_fp16); layer = engine_->AddPlugin(&input, 1, plugin); } - - auto output_name = op_desc.Output("Out")[0]; RreplenishLayerAndOutput(layer, "slice", {output_name}, test_mode); } }; diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index 7aaeb739de194..663534feda1a8 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -407,6 +407,9 @@ class TensorRTEngine { void SetUseDLA(bool use_dla) { use_dla_ = use_dla; } void SetDLACore(int dla_core) { dla_core_ = dla_core; } void SetWithErnie(bool with_ernie) { with_ernie_ = with_ernie; } + void SetWithInterleaved(bool with_interleaved) { + with_interleaved_ = with_interleaved; + } void ClearWeights() { for (auto& weight_pair : weight_map) { @@ -480,6 +483,7 @@ class TensorRTEngine { bool use_oss() { return use_oss_; } bool with_ernie() { return with_ernie_; } + bool with_interleaved() { 
return with_interleaved_; } bool disable_trt_plugin_fp16() { return disable_trt_plugin_fp16_; } bool with_dynamic_shape() { return with_dynamic_shape_; } AnalysisConfig::Precision precision() { return precision_; } @@ -612,6 +616,7 @@ class TensorRTEngine { bool use_dla_{false}; int dla_core_{0}; bool with_ernie_{false}; + bool with_interleaved_{false}; nvinfer1::ILogger& logger_; // max data size for the buffers. From 158bf13f1c133c6af77674560e33413be552d51f Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Thu, 13 Jan 2022 19:52:33 +0800 Subject: [PATCH 15/24] [PTen] Rename kernel register marco (#38861) * rename register marco * fix error changing * fix format error --- cmake/pten_kernel.cmake | 6 +- paddle/pten/core/kernel_registry.h | 820 +++--------------- paddle/pten/kernels/cpu/cast_kernel.cc | 30 +- paddle/pten/kernels/cpu/complex_kernel.cc | 20 +- paddle/pten/kernels/cpu/dot_grad_kernel.cc | 20 +- paddle/pten/kernels/cpu/dot_kernel.cc | 20 +- paddle/pten/kernels/cpu/full_kernel.cc | 50 +- paddle/pten/kernels/cpu/math_kernel.cc | 108 +-- paddle/pten/kernels/cpu/matmul_grad_kernel.cc | 52 +- paddle/pten/kernels/cpu/matmul_kernel.cc | 16 +- paddle/pten/kernels/cpu/scale_kernel.cc | 24 +- paddle/pten/kernels/cpu/sign_kernel.cc | 3 +- paddle/pten/kernels/empty_kernel.cc | 116 +-- paddle/pten/kernels/flatten_grad_kernel.cc | 60 +- paddle/pten/kernels/flatten_kernel.cc | 120 +-- paddle/pten/kernels/gpu/cast_kernel.cu | 36 +- paddle/pten/kernels/gpu/complex_kernel.cu | 22 +- paddle/pten/kernels/gpu/dot_grad_kernel.cu | 20 +- paddle/pten/kernels/gpu/dot_kernel.cu | 20 +- paddle/pten/kernels/gpu/full_kernel.cu | 48 +- paddle/pten/kernels/gpu/math_kernel.cu | 116 +-- paddle/pten/kernels/gpu/matmul_grad_kernel.cu | 58 +- paddle/pten/kernels/gpu/matmul_kernel.cu | 18 +- paddle/pten/kernels/gpu/scale_kernel.cu | 24 +- paddle/pten/kernels/gpu/sign_kernel.cu | 2 +- 25 files changed, 636 insertions(+), 1193 deletions(-) diff --git a/cmake/pten_kernel.cmake b/cmake/pten_kernel.cmake index f962c1332093a..bc9fefb58f452 100644 --- a/cmake/pten_kernel.cmake +++ b/cmake/pten_kernel.cmake @@ -16,12 +16,12 @@ function(kernel_declare TARGET_LIST) foreach(kernel_path ${TARGET_LIST}) file(READ ${kernel_path} kernel_impl) - # TODO(chenweihang): rename PT_REGISTER_CTX_KERNEL to PT_REGISTER_KERNEL + # TODO(chenweihang): rename PT_REGISTER_KERNEL to PT_REGISTER_KERNEL # NOTE(chenweihang): now we don't recommend to use digit in kernel name - string(REGEX MATCH "(PT_REGISTER_CTX_KERNEL|PT_REGISTER_GENERAL_KERNEL)\\([ \t\r\n]*[a-z0-9_]*," first_registry "${kernel_impl}") + string(REGEX MATCH "(PT_REGISTER_KERNEL|PT_REGISTER_GENERAL_KERNEL)\\([ \t\r\n]*[a-z0-9_]*," first_registry "${kernel_impl}") if (NOT first_registry STREQUAL "") # parse the first kernel name - string(REPLACE "PT_REGISTER_CTX_KERNEL(" "" kernel_name "${first_registry}") + string(REPLACE "PT_REGISTER_KERNEL(" "" kernel_name "${first_registry}") string(REPLACE "PT_REGISTER_GENERAL_KERNEL(" "" kernel_name "${kernel_name}") string(REPLACE "," "" kernel_name "${kernel_name}") string(REGEX REPLACE "[ \t\r\n]+" "" kernel_name "${kernel_name}") diff --git a/paddle/pten/core/kernel_registry.h b/paddle/pten/core/kernel_registry.h index f08ef4acfd9ce..194ab52d25688 100644 --- a/paddle/pten/core/kernel_registry.h +++ b/paddle/pten/core/kernel_registry.h @@ -213,20 +213,20 @@ struct KernelRegistrar { * pointer of the corresponding data type is automatically instantiated * during registration. 
* - * Note: `1TA` means `1 template argument` + * Note: `2TA` means `2 template argument` */ #define PT_REGISTER_KERNEL( \ kernel_name, backend, layout, meta_kernel_fn, cpp_dtype, ...) \ PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ pt_register_tp_kernel_ns_check_##kernel_name##_##backend##_##layout, \ "PT_REGISTER_KERNEL must be called in global namespace."); \ - _PT_REGISTER_1TA_KERNEL( \ + _PT_REGISTER_2TA_KERNEL( \ kernel_name, backend, layout, meta_kernel_fn, cpp_dtype, __VA_ARGS__) #ifndef _WIN32 -#define _PT_REGISTER_1TA_KERNEL( \ +#define _PT_REGISTER_2TA_KERNEL( \ kernel_name, backend, layout, meta_kernel_fn, cpp_dtype, ...) \ - PT_KERNEL_INSTANTIATION(meta_kernel_fn, cpp_dtype, __VA_ARGS__); \ + PT_KERNEL_INSTANTIATION(meta_kernel_fn, backend, cpp_dtype, __VA_ARGS__); \ static void __PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ ::pten::Kernel*); \ PT_KERNEL_REGISTRAR_INIT( \ @@ -252,7 +252,7 @@ struct KernelRegistrar { * * And msvc can work without template instantiation */ -#define _PT_REGISTER_1TA_KERNEL( \ +#define _PT_REGISTER_2TA_KERNEL( \ kernel_name, backend, layout, meta_kernel_fn, cpp_dtype, ...) \ static void __PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ ::pten::Kernel*); \ @@ -268,60 +268,76 @@ struct KernelRegistrar { ::pten::Kernel* kernel) #endif -#define PT_KERNEL_INSTANTIATION(meta_kernel_fn, cpp_dtype, ...) \ - _PT_KERNEL_INSTANTIATION(PT_NARGS(cpp_dtype, __VA_ARGS__), \ - meta_kernel_fn, \ - cpp_dtype, \ +#define PT_KERNEL_INSTANTIATION(meta_kernel_fn, backend, cpp_dtype, ...) \ + _PT_KERNEL_INSTANTIATION(PT_NARGS(cpp_dtype, __VA_ARGS__), \ + meta_kernel_fn, \ + backend, \ + cpp_dtype, \ __VA_ARGS__) -#define _PT_KERNEL_INSTANTIATION(N, meta_kernel_fn, cpp_dtype, ...) \ - PT_CONCATENATE(_PT_KERNEL_INSTANTIATION_, N) \ - (meta_kernel_fn, cpp_dtype, __VA_ARGS__) +#define _PT_KERNEL_INSTANTIATION(N, meta_kernel_fn, backend, cpp_dtype, ...) \ + PT_CONCATENATE(_PT_KERNEL_INSTANTIATION_, N) \ + (meta_kernel_fn, backend, cpp_dtype, __VA_ARGS__) -#define _PT_KERNEL_INSTANTIATION_1(meta_kernel_fn, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) meta_kernel_fn -#define _PT_KERNEL_INSTANTIATION_2(meta_kernel_fn, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_1(meta_kernel_fn, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_3(meta_kernel_fn, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_2(meta_kernel_fn, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_4(meta_kernel_fn, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_3(meta_kernel_fn, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_5(meta_kernel_fn, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_4(meta_kernel_fn, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_6(meta_kernel_fn, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_5(meta_kernel_fn, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_7(meta_kernel_fn, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_6(meta_kernel_fn, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_8(meta_kernel_fn, cpp_dtype, ...) 
\ - template decltype(meta_kernel_fn) meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_7(meta_kernel_fn, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_9(meta_kernel_fn, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_8(meta_kernel_fn, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_10(meta_kernel_fn, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_9(meta_kernel_fn, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_11(meta_kernel_fn, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_10(meta_kernel_fn, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_12(meta_kernel_fn, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_11(meta_kernel_fn, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_13(meta_kernel_fn, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_12(meta_kernel_fn, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_14(meta_kernel_fn, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_13(meta_kernel_fn, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_15(meta_kernel_fn, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_14(meta_kernel_fn, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_1(meta_kernel_fn, backend, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) \ + meta_kernel_fn +#define _PT_KERNEL_INSTANTIATION_2(meta_kernel_fn, backend, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) \ + meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_1(meta_kernel_fn, backend, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_3(meta_kernel_fn, backend, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) \ + meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_2(meta_kernel_fn, backend, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_4(meta_kernel_fn, backend, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) \ + meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_3(meta_kernel_fn, backend, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_5(meta_kernel_fn, backend, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) \ + meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_4(meta_kernel_fn, backend, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_6(meta_kernel_fn, backend, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) \ + meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_5(meta_kernel_fn, backend, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_7(meta_kernel_fn, backend, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) \ + meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_6(meta_kernel_fn, backend, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_8(meta_kernel_fn, backend, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) \ + meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_7(meta_kernel_fn, backend, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_9(meta_kernel_fn, backend, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) \ + meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_8(meta_kernel_fn, backend, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_10(meta_kernel_fn, backend, cpp_dtype, ...) 
\ + template decltype(meta_kernel_fn) \ + meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_9(meta_kernel_fn, backend, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_11(meta_kernel_fn, backend, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) \ + meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_10(meta_kernel_fn, backend, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_12(meta_kernel_fn, backend, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) \ + meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_11(meta_kernel_fn, backend, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_13(meta_kernel_fn, backend, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) \ + meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_12(meta_kernel_fn, backend, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_14(meta_kernel_fn, backend, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) \ + meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_13(meta_kernel_fn, backend, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_15(meta_kernel_fn, backend, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) \ + meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_14(meta_kernel_fn, backend, __VA_ARGS__)) #define PT_KERNEL_REGISTRAR_INIT( \ kernel_name, backend, layout, args_def_fn, meta_kernel_fn, cpp_dtype, ...) \ @@ -373,10 +389,11 @@ struct KernelRegistrar { DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ ::pten::KernelArgsParseFunctor)>::Parse, \ + &meta_kernel_fn)>::Parse, \ args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL( \ + meta_kernel_fn)); \ int TouchKernelSymbolFor_##kernel_name##_##backend##_##layout() { return 0; } #define _PT_KERNEL_REGISTRAR_INIT_2(kernel_name, \ backend, \ @@ -393,10 +410,11 @@ struct KernelRegistrar { DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ ::pten::KernelArgsParseFunctor)>::Parse, \ + &meta_kernel_fn)>::Parse, \ args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL( \ + meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_1(kernel_name, \ backend, \ layout, \ @@ -419,10 +437,11 @@ struct KernelRegistrar { DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ ::pten::KernelArgsParseFunctor)>::Parse, \ + &meta_kernel_fn)>::Parse, \ args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL( \ + meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_2(kernel_name, \ backend, \ layout, \ @@ -445,10 +464,11 @@ struct KernelRegistrar { DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ ::pten::KernelArgsParseFunctor)>::Parse, \ + &meta_kernel_fn)>::Parse, \ args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL( \ + meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_3(kernel_name, \ backend, \ layout, \ @@ -471,10 +491,11 @@ struct KernelRegistrar { DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ ::pten::KernelArgsParseFunctor)>::Parse, \ + &meta_kernel_fn)>::Parse, \ args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL( \ + meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_4(kernel_name, \ backend, 
\ layout, \ @@ -497,10 +518,11 @@ struct KernelRegistrar { DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ ::pten::KernelArgsParseFunctor)>::Parse, \ + &meta_kernel_fn)>::Parse, \ args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL( \ + meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_5(kernel_name, \ backend, \ layout, \ @@ -523,10 +545,11 @@ struct KernelRegistrar { DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ ::pten::KernelArgsParseFunctor)>::Parse, \ + &meta_kernel_fn)>::Parse, \ args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL( \ + meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_6(kernel_name, \ backend, \ layout, \ @@ -549,10 +572,11 @@ struct KernelRegistrar { DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ ::pten::KernelArgsParseFunctor)>::Parse, \ + &meta_kernel_fn)>::Parse, \ args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL( \ + meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_7(kernel_name, \ backend, \ layout, \ @@ -575,10 +599,11 @@ struct KernelRegistrar { DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ ::pten::KernelArgsParseFunctor)>::Parse, \ + &meta_kernel_fn)>::Parse, \ args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL( \ + meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_8(kernel_name, \ backend, \ layout, \ @@ -601,10 +626,11 @@ struct KernelRegistrar { DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ ::pten::KernelArgsParseFunctor)>::Parse, \ + &meta_kernel_fn)>::Parse, \ args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL( \ + meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_9(kernel_name, \ backend, \ layout, \ @@ -627,10 +653,11 @@ struct KernelRegistrar { DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ ::pten::KernelArgsParseFunctor)>::Parse, \ + &meta_kernel_fn)>::Parse, \ args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL( \ + meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_10(kernel_name, \ backend, \ layout, \ @@ -653,10 +680,11 @@ struct KernelRegistrar { DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ ::pten::KernelArgsParseFunctor)>::Parse, \ + &meta_kernel_fn)>::Parse, \ args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL( \ + meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_11(kernel_name, \ backend, \ layout, \ @@ -679,10 +707,11 @@ struct KernelRegistrar { DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ ::pten::KernelArgsParseFunctor)>::Parse, \ + &meta_kernel_fn)>::Parse, \ args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL( \ + meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_12(kernel_name, \ backend, \ layout, \ @@ -705,10 +734,11 @@ struct KernelRegistrar { DATALAYOUT(layout), \ 
::paddle::experimental::CppTypeToDataType::Type(), \ ::pten::KernelArgsParseFunctor)>::Parse, \ + &meta_kernel_fn)>::Parse, \ args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL( \ + meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_13(kernel_name, \ backend, \ layout, \ @@ -731,10 +761,11 @@ struct KernelRegistrar { DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ ::pten::KernelArgsParseFunctor)>::Parse, \ + &meta_kernel_fn)>::Parse, \ args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL( \ + meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_14(kernel_name, \ backend, \ layout, \ @@ -743,41 +774,6 @@ struct KernelRegistrar { meta_kernel_fn, \ __VA_ARGS__)) -/** PT_REGISTER_NO_TEMPLATE_KERNEL - * - * Basic Kernel register marco, used to register a no template argument kernel - * function, pass in the complete function pointe of the kernel, this - * registration macro will not do automatic template instantiation. - * - * Note: developer maybe register 2 kernel with same name, backend and diff - * layout, so the layout also need to be a part of symbol var name. If developer - * register 2 kernel with same name, backend, layout and diff dtype, he should - * use another register marco PT_REGISTER_KERNEL. - * - * TODO(chenweihang): remove this marco later - */ -#define PT_REGISTER_NO_TEMPLATE_KERNEL( \ - kernel_name, backend, layout, kernel_fn, dtype) \ - PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ - pt_register_no_t_kernel_ns_check_##kernel_name##_##backend##_##layout, \ - "PT_REGISTER_NO_TEMPLATE_KERNEL must be called in global namespace."); \ - static void __PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ - ::pten::Kernel*); \ - static const ::pten::KernelRegistrar \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::pten::KernelArgsParseFunctor::Parse, \ - &__PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout, \ - PT_KERNEL(kernel_fn), \ - PT_VARIADIC_KERNEL(kernel_fn)); \ - int TouchKernelSymbolFor_##kernel_name##_##backend##_##layout() { \ - return 0; \ - } \ - void __PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ - ::pten::Kernel* kernel) - /** PT_REGISTER_GENERAL_KERNEL * * Basic Kernel register marco, used to register a instantiated kernel function @@ -832,558 +828,6 @@ struct KernelRegistrar { ::pten::Kernel* kernel) #endif -/** PT_REGISTER_CTX_KERNEL - * - * Used for kernel registration with device context and data type as - * template parameter. - */ -#define PT_REGISTER_CTX_KERNEL( \ - kernel_name, backend, layout, meta_kernel_fn, cpp_dtype, ...) \ - PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ - pt_register_tp_ctx_kernel_ns_check_##kernel_name##_##backend##_##layout, \ - "PT_REGISTER_CTX_KERNEL must be called in global namespace."); \ - _PT_REGISTER_2TA_KERNEL( \ - kernel_name, backend, layout, meta_kernel_fn, cpp_dtype, __VA_ARGS__) - -#ifndef _WIN32 -#define _PT_REGISTER_2TA_KERNEL( \ - kernel_name, backend, layout, meta_kernel_fn, cpp_dtype, ...) 
\ - PT_KERNEL_INSTANTIATION2(meta_kernel_fn, backend, cpp_dtype, __VA_ARGS__); \ - static void __PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ - ::pten::Kernel*); \ - PT_KERNEL_REGISTRAR_INIT2( \ - kernel_name, \ - backend, \ - layout, \ - &__PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout, \ - meta_kernel_fn, \ - cpp_dtype, \ - __VA_ARGS__); \ - void __PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ - ::pten::Kernel* kernel) -#else -#define _PT_REGISTER_2TA_KERNEL( \ - kernel_name, backend, layout, meta_kernel_fn, cpp_dtype, ...) \ - static void __PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ - ::pten::Kernel*); \ - PT_KERNEL_REGISTRAR_INIT2( \ - kernel_name, \ - backend, \ - layout, \ - &__PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout, \ - meta_kernel_fn, \ - cpp_dtype, \ - __VA_ARGS__); \ - void __PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ - ::pten::Kernel* kernel) -#endif - -#define PT_KERNEL_INSTANTIATION2(meta_kernel_fn, backend, cpp_dtype, ...) \ - _PT_KERNEL_INSTANTIATION2(PT_NARGS(cpp_dtype, __VA_ARGS__), \ - meta_kernel_fn, \ - backend, \ - cpp_dtype, \ - __VA_ARGS__) - -#define _PT_KERNEL_INSTANTIATION2(N, meta_kernel_fn, backend, cpp_dtype, ...) \ - PT_CONCATENATE(_PT_KERNEL_INSTANTIATION2_, N) \ - (meta_kernel_fn, backend, cpp_dtype, __VA_ARGS__) - -#define _PT_KERNEL_INSTANTIATION2_1(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn -#define _PT_KERNEL_INSTANTIATION2_2(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION2_1(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION2_3(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION2_2(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION2_4(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION2_3(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION2_5(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION2_4(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION2_6(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION2_5(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION2_7(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION2_6(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION2_8(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION2_7(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION2_9(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION2_8(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION2_10(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION2_9(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION2_11(meta_kernel_fn, backend, cpp_dtype, ...) 
\ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION2_10(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION2_12(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION2_11(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION2_13(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION2_12(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION2_14(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION2_13(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION2_15(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION2_14(meta_kernel_fn, backend, __VA_ARGS__)) - -#define PT_KERNEL_REGISTRAR_INIT2( \ - kernel_name, backend, layout, args_def_fn, meta_kernel_fn, cpp_dtype, ...) \ - _PT_KERNEL_REGISTRAR_INIT2(PT_NARGS(cpp_dtype, __VA_ARGS__), \ - kernel_name, \ - backend, \ - layout, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - __VA_ARGS__) - -// clang-format off - -/* The =pre-commit always treats this macro into the wrong format, - and multi-line macros cannot be skipped with NOLINT.*/ -#define _PT_KERNEL_REGISTRAR_INIT2(N, \ - kernel_name, \ - backend, \ - layout, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - PT_CONCATENATE(_PT_KERNEL_REGISTRAR_INIT2_, N) ( \ - kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - __VA_ARGS__) - -// clang-format on - -#define _PT_KERNEL_REGISTRAR_INIT2_1(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::pten::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL( \ - meta_kernel_fn)); \ - int TouchKernelSymbolFor_##kernel_name##_##backend##_##layout() { return 0; } -#define _PT_KERNEL_REGISTRAR_INIT2_2(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::pten::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL( \ - meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT2_1(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT2_3(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) 
\ - static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::pten::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL( \ - meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT2_2(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT2_4(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::pten::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL( \ - meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT2_3(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT2_5(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::pten::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL( \ - meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT2_4(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT2_6(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::pten::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL( \ - meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT2_5(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT2_7(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::pten::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL( \ - meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT2_6(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT2_8(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) 
\ - static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::pten::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL( \ - meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT2_7(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT2_9(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::pten::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL( \ - meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT2_8(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT2_10(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::pten::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL( \ - meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT2_9(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT2_11(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::pten::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL( \ - meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT2_10(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT2_12(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::pten::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL( \ - meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT2_11(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT2_13(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) 
\ - static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::pten::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL( \ - meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT2_12(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT2_14(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::pten::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL( \ - meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT2_13(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT2_15(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::pten::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL( \ - meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT2_14(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - __VA_ARGS__)) - /** PT_DECLARE_KERNEL * * Used to export the symbols of the file where the kernel is located, diff --git a/paddle/pten/kernels/cpu/cast_kernel.cc b/paddle/pten/kernels/cpu/cast_kernel.cc index c6736cdd1bcf0..a0006f49a2b38 100644 --- a/paddle/pten/kernels/cpu/cast_kernel.cc +++ b/paddle/pten/kernels/cpu/cast_kernel.cc @@ -58,20 +58,20 @@ void CastKernel(const Context& dev_ctx, } // namespace pten -PT_REGISTER_CTX_KERNEL(cast, - CPU, - ALL_LAYOUT, - pten::CastKernel, - float, - double, - int, - int64_t, - int16_t, - bool, - uint8_t, - paddle::platform::float16, - paddle::platform::bfloat16, - paddle::platform::complex, - paddle::platform::complex) { +PT_REGISTER_KERNEL(cast, + CPU, + ALL_LAYOUT, + pten::CastKernel, + float, + double, + int, + int64_t, + int16_t, + bool, + uint8_t, + paddle::platform::float16, + paddle::platform::bfloat16, + paddle::platform::complex, + paddle::platform::complex) { kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); } diff --git a/paddle/pten/kernels/cpu/complex_kernel.cc b/paddle/pten/kernels/cpu/complex_kernel.cc index 10e7e684db3c1..59a7577153a61 100644 --- a/paddle/pten/kernels/cpu/complex_kernel.cc +++ b/paddle/pten/kernels/cpu/complex_kernel.cc @@ -21,13 +21,13 @@ // See Note [ Why still include the fluid headers? 
] #include "paddle/fluid/platform/complex.h" -PT_REGISTER_CTX_KERNEL(conj, - CPU, - ALL_LAYOUT, - pten::ConjKernel, - paddle::platform::complex, - paddle::platform::complex, - float, - double, - int, - int64_t) {} +PT_REGISTER_KERNEL(conj, + CPU, + ALL_LAYOUT, + pten::ConjKernel, + paddle::platform::complex, + paddle::platform::complex, + float, + double, + int, + int64_t) {} diff --git a/paddle/pten/kernels/cpu/dot_grad_kernel.cc b/paddle/pten/kernels/cpu/dot_grad_kernel.cc index c9d5c35e134c8..ed927f820f0e7 100644 --- a/paddle/pten/kernels/cpu/dot_grad_kernel.cc +++ b/paddle/pten/kernels/cpu/dot_grad_kernel.cc @@ -20,13 +20,13 @@ #include "paddle/fluid/platform/complex.h" -PT_REGISTER_CTX_KERNEL(dot_grad, - CPU, - ALL_LAYOUT, - pten::DotGradKernel, - float, - double, - int, - int64_t, - paddle::platform::complex, - paddle::platform::complex) {} +PT_REGISTER_KERNEL(dot_grad, + CPU, + ALL_LAYOUT, + pten::DotGradKernel, + float, + double, + int, + int64_t, + paddle::platform::complex, + paddle::platform::complex) {} diff --git a/paddle/pten/kernels/cpu/dot_kernel.cc b/paddle/pten/kernels/cpu/dot_kernel.cc index 72e9e28907f90..0baf9ba0a8bdd 100644 --- a/paddle/pten/kernels/cpu/dot_kernel.cc +++ b/paddle/pten/kernels/cpu/dot_kernel.cc @@ -49,13 +49,13 @@ void DotKernel(const Context& dev_ctx, using complex64 = ::paddle::platform::complex; using complex128 = ::paddle::platform::complex; -PT_REGISTER_CTX_KERNEL(dot, - CPU, - ALL_LAYOUT, - pten::DotKernel, - float, - double, - int, - int64_t, - complex64, - complex128) {} +PT_REGISTER_KERNEL(dot, + CPU, + ALL_LAYOUT, + pten::DotKernel, + float, + double, + int, + int64_t, + complex64, + complex128) {} diff --git a/paddle/pten/kernels/cpu/full_kernel.cc b/paddle/pten/kernels/cpu/full_kernel.cc index 1ae8001d79dc7..919471d86ac53 100644 --- a/paddle/pten/kernels/cpu/full_kernel.cc +++ b/paddle/pten/kernels/cpu/full_kernel.cc @@ -18,29 +18,29 @@ limitations under the License. 
*/ #include "paddle/pten/core/kernel_registry.h" #include "paddle/pten/kernels/impl/full_kernel_impl.h" -PT_REGISTER_CTX_KERNEL(full, - CPU, - ALL_LAYOUT, - pten::FullKernel, - float, - double, - uint8_t, - int16_t, - int, - int64_t, - bool, - paddle::platform::float16, - paddle::platform::bfloat16, - paddle::platform::complex, - paddle::platform::complex) {} +PT_REGISTER_KERNEL(full, + CPU, + ALL_LAYOUT, + pten::FullKernel, + float, + double, + uint8_t, + int16_t, + int, + int64_t, + bool, + paddle::platform::float16, + paddle::platform::bfloat16, + paddle::platform::complex, + paddle::platform::complex) {} -PT_REGISTER_CTX_KERNEL(full_like, - CPU, - ALL_LAYOUT, - pten::FullLikeKernel, - float, - double, - int, - int64_t, - bool, - paddle::platform::float16) {} +PT_REGISTER_KERNEL(full_like, + CPU, + ALL_LAYOUT, + pten::FullLikeKernel, + float, + double, + int, + int64_t, + bool, + paddle::platform::float16) {} diff --git a/paddle/pten/kernels/cpu/math_kernel.cc b/paddle/pten/kernels/cpu/math_kernel.cc index be0d52355bce6..83388d0d9a80f 100644 --- a/paddle/pten/kernels/cpu/math_kernel.cc +++ b/paddle/pten/kernels/cpu/math_kernel.cc @@ -118,60 +118,60 @@ using complex128 = ::paddle::platform::complex; // NOTE(chenweihang): using bfloat16 will cause redefine with xpu bfloat16 // using bfloat16 = ::paddle::platform::bfloat16; -PT_REGISTER_CTX_KERNEL( +PT_REGISTER_KERNEL( mean, CPU, ALL_LAYOUT, pten::MeanKernel, float, double, bool) {} -PT_REGISTER_CTX_KERNEL(add, - CPU, - ALL_LAYOUT, - pten::AddKernel, - float, - double, - int, - int64_t, - complex64, - complex128) {} -PT_REGISTER_CTX_KERNEL(subtract, - CPU, - ALL_LAYOUT, - pten::SubtractKernel, - float, - double, - int, - int64_t, - complex64, - complex128) {} -PT_REGISTER_CTX_KERNEL(divide, - CPU, - ALL_LAYOUT, - pten::DivideKernel, - float, - double, - int, - int64_t, - complex64, - complex128) {} -PT_REGISTER_CTX_KERNEL(multiply, - CPU, - ALL_LAYOUT, - pten::MultiplyKernel, - float, - double, - int, - int64_t, - bool, - complex64, - complex128) {} -PT_REGISTER_CTX_KERNEL(sum, - CPU, - ALL_LAYOUT, - pten::SumKernel, - bool, - float, - double, - paddle::platform::float16, - int, - int64_t, - complex64, - complex128) { +PT_REGISTER_KERNEL(add, + CPU, + ALL_LAYOUT, + pten::AddKernel, + float, + double, + int, + int64_t, + complex64, + complex128) {} +PT_REGISTER_KERNEL(subtract, + CPU, + ALL_LAYOUT, + pten::SubtractKernel, + float, + double, + int, + int64_t, + complex64, + complex128) {} +PT_REGISTER_KERNEL(divide, + CPU, + ALL_LAYOUT, + pten::DivideKernel, + float, + double, + int, + int64_t, + complex64, + complex128) {} +PT_REGISTER_KERNEL(multiply, + CPU, + ALL_LAYOUT, + pten::MultiplyKernel, + float, + double, + int, + int64_t, + bool, + complex64, + complex128) {} +PT_REGISTER_KERNEL(sum, + CPU, + ALL_LAYOUT, + pten::SumKernel, + bool, + float, + double, + paddle::platform::float16, + int, + int64_t, + complex64, + complex128) { kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); } diff --git a/paddle/pten/kernels/cpu/matmul_grad_kernel.cc b/paddle/pten/kernels/cpu/matmul_grad_kernel.cc index 5a8abb6701b0e..4738e21573194 100644 --- a/paddle/pten/kernels/cpu/matmul_grad_kernel.cc +++ b/paddle/pten/kernels/cpu/matmul_grad_kernel.cc @@ -19,29 +19,29 @@ limitations under the License. 
*/ #include "paddle/pten/kernels/impl/matmul_grad_kernel_impl.h" -PT_REGISTER_CTX_KERNEL(matmul_grad, - CPU, - ALL_LAYOUT, - pten::MatmulGradKernel, - float, - double, - paddle::platform::complex, - paddle::platform::complex) {} - -PT_REGISTER_CTX_KERNEL(matmul_double_grad, - CPU, - ALL_LAYOUT, - pten::MatmulDoubleGradKernel, - float, - double, - paddle::platform::complex, - paddle::platform::complex) {} - -PT_REGISTER_CTX_KERNEL(matmul_triple_grad, - CPU, - ALL_LAYOUT, - pten::MatmulTripleGradKernel, - float, - double, - paddle::platform::complex, - paddle::platform::complex) {} +PT_REGISTER_KERNEL(matmul_grad, + CPU, + ALL_LAYOUT, + pten::MatmulGradKernel, + float, + double, + paddle::platform::complex, + paddle::platform::complex) {} + +PT_REGISTER_KERNEL(matmul_double_grad, + CPU, + ALL_LAYOUT, + pten::MatmulDoubleGradKernel, + float, + double, + paddle::platform::complex, + paddle::platform::complex) {} + +PT_REGISTER_KERNEL(matmul_triple_grad, + CPU, + ALL_LAYOUT, + pten::MatmulTripleGradKernel, + float, + double, + paddle::platform::complex, + paddle::platform::complex) {} diff --git a/paddle/pten/kernels/cpu/matmul_kernel.cc b/paddle/pten/kernels/cpu/matmul_kernel.cc index edba402ec1d84..f749e9cb27979 100644 --- a/paddle/pten/kernels/cpu/matmul_kernel.cc +++ b/paddle/pten/kernels/cpu/matmul_kernel.cc @@ -20,11 +20,11 @@ limitations under the License. */ #include "paddle/fluid/platform/complex.h" #include "paddle/pten/kernels/impl/matmul_kernel_impl.h" -PT_REGISTER_CTX_KERNEL(matmul, - CPU, - ALL_LAYOUT, - pten::MatmulKernel, - float, - double, - paddle::platform::complex, - paddle::platform::complex) {} +PT_REGISTER_KERNEL(matmul, + CPU, + ALL_LAYOUT, + pten::MatmulKernel, + float, + double, + paddle::platform::complex, + paddle::platform::complex) {} diff --git a/paddle/pten/kernels/cpu/scale_kernel.cc b/paddle/pten/kernels/cpu/scale_kernel.cc index 0582fb87b4457..7088bba01aa78 100644 --- a/paddle/pten/kernels/cpu/scale_kernel.cc +++ b/paddle/pten/kernels/cpu/scale_kernel.cc @@ -51,15 +51,15 @@ void ScaleKernel(const Context& dev_ctx, } // namespace pten -PT_REGISTER_CTX_KERNEL(scale, - CPU, - ALL_LAYOUT, - pten::ScaleKernel, - float, - double, - paddle::platform::bfloat16, - uint8_t, - int8_t, - int16_t, - int, - int64_t) {} +PT_REGISTER_KERNEL(scale, + CPU, + ALL_LAYOUT, + pten::ScaleKernel, + float, + double, + paddle::platform::bfloat16, + uint8_t, + int8_t, + int16_t, + int, + int64_t) {} diff --git a/paddle/pten/kernels/cpu/sign_kernel.cc b/paddle/pten/kernels/cpu/sign_kernel.cc index a7b62822d6e0f..25fa2bb5fe4ef 100644 --- a/paddle/pten/kernels/cpu/sign_kernel.cc +++ b/paddle/pten/kernels/cpu/sign_kernel.cc @@ -21,5 +21,4 @@ limitations under the License. */ // See Note [ Why still include the fluid headers? 
] #include "paddle/fluid/platform/bfloat16.h" -PT_REGISTER_CTX_KERNEL(sign, CPU, ALL_LAYOUT, pten::SignKernel, float, double) { -} +PT_REGISTER_KERNEL(sign, CPU, ALL_LAYOUT, pten::SignKernel, float, double) {} diff --git a/paddle/pten/kernels/empty_kernel.cc b/paddle/pten/kernels/empty_kernel.cc index 2dd55a13e38e5..eb67ed6655f47 100644 --- a/paddle/pten/kernels/empty_kernel.cc +++ b/paddle/pten/kernels/empty_kernel.cc @@ -34,66 +34,66 @@ void EmptyLikeKernel(const Context& dev_ctx, DenseTensor* out) { } // namespace pten -PT_REGISTER_CTX_KERNEL(empty, - CPU, - ALL_LAYOUT, - pten::EmptyKernel, - float, - double, - uint8_t, - int16_t, - int, - int64_t, - bool, - paddle::platform::float16, - paddle::platform::bfloat16, - paddle::platform::complex, - paddle::platform::complex) {} +PT_REGISTER_KERNEL(empty, + CPU, + ALL_LAYOUT, + pten::EmptyKernel, + float, + double, + uint8_t, + int16_t, + int, + int64_t, + bool, + paddle::platform::float16, + paddle::platform::bfloat16, + paddle::platform::complex, + paddle::platform::complex) {} -PT_REGISTER_CTX_KERNEL(empty_like, - CPU, - ALL_LAYOUT, - pten::EmptyLikeKernel, - float, - double, - uint8_t, - int16_t, - int, - int64_t, - bool, - paddle::platform::float16, - paddle::platform::bfloat16, - paddle::platform::complex, - paddle::platform::complex) {} +PT_REGISTER_KERNEL(empty_like, + CPU, + ALL_LAYOUT, + pten::EmptyLikeKernel, + float, + double, + uint8_t, + int16_t, + int, + int64_t, + bool, + paddle::platform::float16, + paddle::platform::bfloat16, + paddle::platform::complex, + paddle::platform::complex) {} #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PT_REGISTER_CTX_KERNEL(empty, - GPU, - ALL_LAYOUT, - pten::EmptyKernel, - float, - double, - uint8_t, - int16_t, - int, - int64_t, - bool, - paddle::platform::float16, - paddle::platform::complex, - paddle::platform::complex) {} +PT_REGISTER_KERNEL(empty, + GPU, + ALL_LAYOUT, + pten::EmptyKernel, + float, + double, + uint8_t, + int16_t, + int, + int64_t, + bool, + paddle::platform::float16, + paddle::platform::complex, + paddle::platform::complex) {} -PT_REGISTER_CTX_KERNEL(empty_like, - GPU, - ALL_LAYOUT, - pten::EmptyLikeKernel, - float, - double, - uint8_t, - int16_t, - int, - int64_t, - bool, - paddle::platform::float16, - paddle::platform::complex, - paddle::platform::complex) {} +PT_REGISTER_KERNEL(empty_like, + GPU, + ALL_LAYOUT, + pten::EmptyLikeKernel, + float, + double, + uint8_t, + int16_t, + int, + int64_t, + bool, + paddle::platform::float16, + paddle::platform::complex, + paddle::platform::complex) {} #endif diff --git a/paddle/pten/kernels/flatten_grad_kernel.cc b/paddle/pten/kernels/flatten_grad_kernel.cc index d6aea31748d6c..45f3c6558d9c8 100644 --- a/paddle/pten/kernels/flatten_grad_kernel.cc +++ b/paddle/pten/kernels/flatten_grad_kernel.cc @@ -33,41 +33,41 @@ void FlattenGradKernel(const Context& dev_ctx, } // namespace pten -PT_REGISTER_CTX_KERNEL(flatten_grad, - CPU, - ALL_LAYOUT, - pten::FlattenGradKernel, - float, - double, - uint8_t, - int8_t, - int, - int64_t) {} +PT_REGISTER_KERNEL(flatten_grad, + CPU, + ALL_LAYOUT, + pten::FlattenGradKernel, + float, + double, + uint8_t, + int8_t, + int, + int64_t) {} #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PT_REGISTER_CTX_KERNEL(flatten_grad, - GPU, - ALL_LAYOUT, - pten::FlattenGradKernel, - float, - paddle::platform::float16, - double, - uint8_t, - int8_t, - int, - int64_t) {} +PT_REGISTER_KERNEL(flatten_grad, + GPU, + ALL_LAYOUT, + pten::FlattenGradKernel, + float, + paddle::platform::float16, + double, 
+ uint8_t, + int8_t, + int, + int64_t) {} #endif #ifdef PADDLE_WITH_XPU -PT_REGISTER_CTX_KERNEL(flatten_grad, - XPU, - ALL_LAYOUT, - pten::FlattenGradKernel, - float, - paddle::platform::float16, - int8_t, - int, - int64_t) {} +PT_REGISTER_KERNEL(flatten_grad, + XPU, + ALL_LAYOUT, + pten::FlattenGradKernel, + float, + paddle::platform::float16, + int8_t, + int, + int64_t) {} #endif diff --git a/paddle/pten/kernels/flatten_kernel.cc b/paddle/pten/kernels/flatten_kernel.cc index b284d3690830f..9201a8df9d166 100644 --- a/paddle/pten/kernels/flatten_kernel.cc +++ b/paddle/pten/kernels/flatten_kernel.cc @@ -48,72 +48,72 @@ void FlattenWithXShape(const Context& dev_ctx, } // namespace pten -PT_REGISTER_CTX_KERNEL(flatten, - CPU, - ALL_LAYOUT, - pten::FlattenKernel, - float, - double, - uint8_t, - int8_t, - int, - int64_t) {} +PT_REGISTER_KERNEL(flatten, + CPU, + ALL_LAYOUT, + pten::FlattenKernel, + float, + double, + uint8_t, + int8_t, + int, + int64_t) {} -PT_REGISTER_CTX_KERNEL(flatten_with_xshape, - CPU, - ALL_LAYOUT, - pten::FlattenWithXShape, - float, - double, - uint8_t, - int8_t, - int, - int64_t) {} +PT_REGISTER_KERNEL(flatten_with_xshape, + CPU, + ALL_LAYOUT, + pten::FlattenWithXShape, + float, + double, + uint8_t, + int8_t, + int, + int64_t) {} #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PT_REGISTER_CTX_KERNEL(flatten, - GPU, - ALL_LAYOUT, - pten::FlattenKernel, - float, - paddle::platform::float16, - double, - uint8_t, - int8_t, - int, - int64_t) {} +PT_REGISTER_KERNEL(flatten, + GPU, + ALL_LAYOUT, + pten::FlattenKernel, + float, + paddle::platform::float16, + double, + uint8_t, + int8_t, + int, + int64_t) {} -PT_REGISTER_CTX_KERNEL(flatten_with_xshape, - GPU, - ALL_LAYOUT, - pten::FlattenWithXShape, - float, - paddle::platform::float16, - double, - uint8_t, - int8_t, - int, - int64_t) {} +PT_REGISTER_KERNEL(flatten_with_xshape, + GPU, + ALL_LAYOUT, + pten::FlattenWithXShape, + float, + paddle::platform::float16, + double, + uint8_t, + int8_t, + int, + int64_t) {} #endif #ifdef PADDLE_WITH_XPU -PT_REGISTER_CTX_KERNEL(flatten, - XPU, - ALL_LAYOUT, - pten::FlattenKernel, - float, - paddle::platform::float16, - int8_t, - int, - int64_t) {} +PT_REGISTER_KERNEL(flatten, + XPU, + ALL_LAYOUT, + pten::FlattenKernel, + float, + paddle::platform::float16, + int8_t, + int, + int64_t) {} -PT_REGISTER_CTX_KERNEL(flatten_with_xshape, - XPU, - ALL_LAYOUT, - pten::FlattenWithXShape, - float, - paddle::platform::float16, - int8_t, - int, - int64_t) {} +PT_REGISTER_KERNEL(flatten_with_xshape, + XPU, + ALL_LAYOUT, + pten::FlattenWithXShape, + float, + paddle::platform::float16, + int8_t, + int, + int64_t) {} #endif diff --git a/paddle/pten/kernels/gpu/cast_kernel.cu b/paddle/pten/kernels/gpu/cast_kernel.cu index 0bbe7a3a132d1..2f91c94ba5f75 100644 --- a/paddle/pten/kernels/gpu/cast_kernel.cu +++ b/paddle/pten/kernels/gpu/cast_kernel.cu @@ -60,24 +60,24 @@ void CastKernel(const Context& dev_ctx, } // namespace pten -#define PTEN_REGISTER_CAST_CUDA_BASE_TYPE(op_name, ...) \ - PT_REGISTER_CTX_KERNEL(cast, \ - GPU, \ - ALL_LAYOUT, \ - pten::CastKernel, \ - float, \ - double, \ - int, \ - int64_t, \ - int16_t, \ - bool, \ - uint8_t, \ - paddle::platform::float16, \ - paddle::platform::complex, \ - paddle::platform::complex, \ - ##__VA_ARGS__) { \ - kernel->OutputAt(0).SetDataType( \ - paddle::experimental::DataType::UNDEFINED); \ +#define PTEN_REGISTER_CAST_CUDA_BASE_TYPE(op_name, ...) 
\ + PT_REGISTER_KERNEL(cast, \ + GPU, \ + ALL_LAYOUT, \ + pten::CastKernel, \ + float, \ + double, \ + int, \ + int64_t, \ + int16_t, \ + bool, \ + uint8_t, \ + paddle::platform::float16, \ + paddle::platform::complex, \ + paddle::platform::complex, \ + ##__VA_ARGS__) { \ + kernel->OutputAt(0).SetDataType( \ + paddle::experimental::DataType::UNDEFINED); \ } #if !defined(PADDLE_WITH_HIP) diff --git a/paddle/pten/kernels/gpu/complex_kernel.cu b/paddle/pten/kernels/gpu/complex_kernel.cu index 02f050f5bc838..1c82077793e0a 100644 --- a/paddle/pten/kernels/gpu/complex_kernel.cu +++ b/paddle/pten/kernels/gpu/complex_kernel.cu @@ -21,14 +21,14 @@ // See Note [ Why still include the fluid headers? ] #include "paddle/fluid/platform/complex.h" -PT_REGISTER_CTX_KERNEL(conj, - GPU, - ALL_LAYOUT, - pten::ConjKernel, - paddle::platform::float16, - paddle::platform::complex, - paddle::platform::complex, - float, - double, - int, - int64_t) {} +PT_REGISTER_KERNEL(conj, + GPU, + ALL_LAYOUT, + pten::ConjKernel, + paddle::platform::float16, + paddle::platform::complex, + paddle::platform::complex, + float, + double, + int, + int64_t) {} diff --git a/paddle/pten/kernels/gpu/dot_grad_kernel.cu b/paddle/pten/kernels/gpu/dot_grad_kernel.cu index 42af96f7c7265..4b0d7fed4c9fd 100644 --- a/paddle/pten/kernels/gpu/dot_grad_kernel.cu +++ b/paddle/pten/kernels/gpu/dot_grad_kernel.cu @@ -20,13 +20,13 @@ limitations under the License. */ #include "paddle/fluid/platform/complex.h" -PT_REGISTER_CTX_KERNEL(dot_grad, - GPU, - ALL_LAYOUT, - pten::DotGradKernel, - float, - double, - int, - int64_t, - paddle::platform::complex, - paddle::platform::complex) {} +PT_REGISTER_KERNEL(dot_grad, + GPU, + ALL_LAYOUT, + pten::DotGradKernel, + float, + double, + int, + int64_t, + paddle::platform::complex, + paddle::platform::complex) {} diff --git a/paddle/pten/kernels/gpu/dot_kernel.cu b/paddle/pten/kernels/gpu/dot_kernel.cu index 08d8f83c408de..18bab5c15a058 100644 --- a/paddle/pten/kernels/gpu/dot_kernel.cu +++ b/paddle/pten/kernels/gpu/dot_kernel.cu @@ -52,13 +52,13 @@ void DotKernel(const Context& dev_ctx, using complex64 = ::paddle::platform::complex; using complex128 = ::paddle::platform::complex; -PT_REGISTER_CTX_KERNEL(dot, - GPU, - ALL_LAYOUT, - pten::DotKernel, - float, - double, - int, - int64_t, - complex64, - complex128) {} +PT_REGISTER_KERNEL(dot, + GPU, + ALL_LAYOUT, + pten::DotKernel, + float, + double, + int, + int64_t, + complex64, + complex128) {} diff --git a/paddle/pten/kernels/gpu/full_kernel.cu b/paddle/pten/kernels/gpu/full_kernel.cu index ae1f8529db3de..2f6346daa888f 100644 --- a/paddle/pten/kernels/gpu/full_kernel.cu +++ b/paddle/pten/kernels/gpu/full_kernel.cu @@ -18,28 +18,28 @@ limitations under the License. 
*/ #include "paddle/pten/core/kernel_registry.h" #include "paddle/pten/kernels/impl/full_kernel_impl.h" -PT_REGISTER_CTX_KERNEL(full, - GPU, - ALL_LAYOUT, - pten::FullKernel, - float, - double, - uint8_t, - int16_t, - int, - int64_t, - bool, - paddle::platform::float16, - paddle::platform::complex, - paddle::platform::complex) {} +PT_REGISTER_KERNEL(full, + GPU, + ALL_LAYOUT, + pten::FullKernel, + float, + double, + uint8_t, + int16_t, + int, + int64_t, + bool, + paddle::platform::float16, + paddle::platform::complex, + paddle::platform::complex) {} -PT_REGISTER_CTX_KERNEL(full_like, - GPU, - ALL_LAYOUT, - pten::FullLikeKernel, - float, - double, - int, - int64_t, - bool, - paddle::platform::float16) {} +PT_REGISTER_KERNEL(full_like, + GPU, + ALL_LAYOUT, + pten::FullLikeKernel, + float, + double, + int, + int64_t, + bool, + paddle::platform::float16) {} diff --git a/paddle/pten/kernels/gpu/math_kernel.cu b/paddle/pten/kernels/gpu/math_kernel.cu index 557080638038d..1fd085ab5fe40 100644 --- a/paddle/pten/kernels/gpu/math_kernel.cu +++ b/paddle/pten/kernels/gpu/math_kernel.cu @@ -110,64 +110,64 @@ using float16 = paddle::platform::float16; using complex64 = ::paddle::platform::complex; using complex128 = ::paddle::platform::complex; -PT_REGISTER_CTX_KERNEL( +PT_REGISTER_KERNEL( mean, GPU, ALL_LAYOUT, pten::MeanKernel, float, double, bool, float16) {} -PT_REGISTER_CTX_KERNEL(add, - GPU, - ALL_LAYOUT, - pten::AddKernel, - float, - double, - int, - int64_t, - float16, - complex64, - complex128) {} -PT_REGISTER_CTX_KERNEL(subtract, - GPU, - ALL_LAYOUT, - pten::SubtractKernel, - float, - double, - int, - int64_t, - float16, - complex64, - complex128) {} -PT_REGISTER_CTX_KERNEL(divide, - GPU, - ALL_LAYOUT, - pten::DivideKernel, - float, - double, - int, - int64_t, - float16, - complex64, - complex128) {} -PT_REGISTER_CTX_KERNEL(multiply, - GPU, - ALL_LAYOUT, - pten::MultiplyKernel, - float, - double, - int, - int64_t, - bool, - float16, - complex64, - complex128) {} -PT_REGISTER_CTX_KERNEL(sum, - GPU, - ALL_LAYOUT, - pten::SumKernel, - bool, - float, - double, - float16, - int, - int64_t, - complex64, - complex128) { +PT_REGISTER_KERNEL(add, + GPU, + ALL_LAYOUT, + pten::AddKernel, + float, + double, + int, + int64_t, + float16, + complex64, + complex128) {} +PT_REGISTER_KERNEL(subtract, + GPU, + ALL_LAYOUT, + pten::SubtractKernel, + float, + double, + int, + int64_t, + float16, + complex64, + complex128) {} +PT_REGISTER_KERNEL(divide, + GPU, + ALL_LAYOUT, + pten::DivideKernel, + float, + double, + int, + int64_t, + float16, + complex64, + complex128) {} +PT_REGISTER_KERNEL(multiply, + GPU, + ALL_LAYOUT, + pten::MultiplyKernel, + float, + double, + int, + int64_t, + bool, + float16, + complex64, + complex128) {} +PT_REGISTER_KERNEL(sum, + GPU, + ALL_LAYOUT, + pten::SumKernel, + bool, + float, + double, + float16, + int, + int64_t, + complex64, + complex128) { kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); } diff --git a/paddle/pten/kernels/gpu/matmul_grad_kernel.cu b/paddle/pten/kernels/gpu/matmul_grad_kernel.cu index f20c3f82c9262..993b17f6b8ed0 100644 --- a/paddle/pten/kernels/gpu/matmul_grad_kernel.cu +++ b/paddle/pten/kernels/gpu/matmul_grad_kernel.cu @@ -19,32 +19,32 @@ limitations under the License. 
*/ #include "paddle/pten/kernels/impl/matmul_grad_kernel_impl.h" -PT_REGISTER_CTX_KERNEL(matmul_grad, - GPU, - ALL_LAYOUT, - pten::MatmulGradKernel, - float, - double, - paddle::platform::float16, - paddle::platform::complex, - paddle::platform::complex) {} - -PT_REGISTER_CTX_KERNEL(matmul_double_grad, - GPU, - ALL_LAYOUT, - pten::MatmulDoubleGradKernel, - float, - double, - paddle::platform::float16, - paddle::platform::complex, - paddle::platform::complex) {} - -PT_REGISTER_CTX_KERNEL(matmul_triple_grad, - GPU, - ALL_LAYOUT, - pten::MatmulTripleGradKernel, - float, - double, - paddle::platform::float16, - paddle::platform::complex, - paddle::platform::complex) {} +PT_REGISTER_KERNEL(matmul_grad, + GPU, + ALL_LAYOUT, + pten::MatmulGradKernel, + float, + double, + paddle::platform::float16, + paddle::platform::complex, + paddle::platform::complex) {} + +PT_REGISTER_KERNEL(matmul_double_grad, + GPU, + ALL_LAYOUT, + pten::MatmulDoubleGradKernel, + float, + double, + paddle::platform::float16, + paddle::platform::complex, + paddle::platform::complex) {} + +PT_REGISTER_KERNEL(matmul_triple_grad, + GPU, + ALL_LAYOUT, + pten::MatmulTripleGradKernel, + float, + double, + paddle::platform::float16, + paddle::platform::complex, + paddle::platform::complex) {} diff --git a/paddle/pten/kernels/gpu/matmul_kernel.cu b/paddle/pten/kernels/gpu/matmul_kernel.cu index debda455818a9..a3ab88913a3b6 100644 --- a/paddle/pten/kernels/gpu/matmul_kernel.cu +++ b/paddle/pten/kernels/gpu/matmul_kernel.cu @@ -20,12 +20,12 @@ limitations under the License. */ #include "paddle/fluid/platform/complex.h" #include "paddle/pten/kernels/impl/matmul_kernel_impl.h" -PT_REGISTER_CTX_KERNEL(matmul, - GPU, - ALL_LAYOUT, - pten::MatmulKernel, - float, - double, - paddle::platform::float16, - paddle::platform::complex, - paddle::platform::complex) {} +PT_REGISTER_KERNEL(matmul, + GPU, + ALL_LAYOUT, + pten::MatmulKernel, + float, + double, + paddle::platform::float16, + paddle::platform::complex, + paddle::platform::complex) {} diff --git a/paddle/pten/kernels/gpu/scale_kernel.cu b/paddle/pten/kernels/gpu/scale_kernel.cu index ff7e2a6ed284c..4d63701413cd6 100644 --- a/paddle/pten/kernels/gpu/scale_kernel.cu +++ b/paddle/pten/kernels/gpu/scale_kernel.cu @@ -64,15 +64,15 @@ void ScaleKernel(const ContextT& dev_ctx, } // namespace pten -PT_REGISTER_CTX_KERNEL(scale, - GPU, - ALL_LAYOUT, - pten::ScaleKernel, - float, - double, - paddle::platform::float16, - uint8_t, - int8_t, - int16_t, - int, - int64_t) {} +PT_REGISTER_KERNEL(scale, + GPU, + ALL_LAYOUT, + pten::ScaleKernel, + float, + double, + paddle::platform::float16, + uint8_t, + int8_t, + int16_t, + int, + int64_t) {} diff --git a/paddle/pten/kernels/gpu/sign_kernel.cu b/paddle/pten/kernels/gpu/sign_kernel.cu index e7eb7e46861c8..16356507dc8ea 100644 --- a/paddle/pten/kernels/gpu/sign_kernel.cu +++ b/paddle/pten/kernels/gpu/sign_kernel.cu @@ -23,5 +23,5 @@ limitations under the License. 
*/ using float16 = paddle::platform::float16; -PT_REGISTER_CTX_KERNEL( +PT_REGISTER_KERNEL( sign, GPU, ALL_LAYOUT, pten::SignKernel, float, double, float16) {} From 9ff989aeae54472f766bc6ffef8a13111ca8da51 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=9F=B3=E6=99=93=E4=BC=9F?= <39303645+Shixiaowei02@users.noreply.github.com> Date: Fri, 14 Jan 2022 11:26:01 +0800 Subject: [PATCH 16/24] remove interface: DenseTensor::release, test=develop (#38937) --- paddle/fluid/pybind/eager_method.cc | 2 +- paddle/pten/api/lib/utils/tensor_utils.cc | 6 ++---- paddle/pten/core/dense_tensor.h | 6 ------ paddle/pten/tests/core/test_dense_tensor.cc | 7 ------- 4 files changed, 3 insertions(+), 18 deletions(-) diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index a8c1da2a8b866..46b56f27ff98e 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -189,7 +189,7 @@ static PyObject* eager_tensor__clear_gradient(EagerTensorObject* self, << " is initialized, will be released."; auto dense_tensor = std::dynamic_pointer_cast(grad->impl()); - dense_tensor->release(); + dense_tensor->MoveMemoryHolder(); } Py_INCREF(Py_None); return Py_None; diff --git a/paddle/pten/api/lib/utils/tensor_utils.cc b/paddle/pten/api/lib/utils/tensor_utils.cc index 0b6cb8d95cc1a..53d641896e43f 100644 --- a/paddle/pten/api/lib/utils/tensor_utils.cc +++ b/paddle/pten/api/lib/utils/tensor_utils.cc @@ -306,10 +306,8 @@ void MovesStorage(pten::DenseTensor* src, paddle::framework::Tensor* dst) { "The destination Tensor is nullptr when move storage.")); dst->Resize(src->dims()); dst->set_type(pten::TransToProtoVarType(src->dtype())); - auto storage = src->release(); - std::shared_ptr holder( - new TensorStorage(std::move(storage))); - dst->ResetHolderWithType(holder, pten::TransToProtoVarType(src->dtype())); + auto storage = src->MoveMemoryHolder(); + dst->ResetHolderWithType(storage, pten::TransToProtoVarType(src->dtype())); dst->set_offset(src->meta().offset); } diff --git a/paddle/pten/core/dense_tensor.h b/paddle/pten/core/dense_tensor.h index 1802a2461158f..4f25fc296724c 100644 --- a/paddle/pten/core/dense_tensor.h +++ b/paddle/pten/core/dense_tensor.h @@ -172,12 +172,6 @@ class DenseTensor : public TensorBase, /// \return The actual storage size occupied by tensor. size_t capacity() const { return storage_->size(); } - /// \brief Release the storage area for other purposes. Because of the - /// destruction of encapsulation, we do not support two dense tensors directly - /// sharing the same intrusive pointer. - /// \return The rvalue of instrusize pointer releated to the released storage. - intrusive_ptr release() { return std::move(storage_); } - /// \brief Get the mutable data pointer value of type T. /// Memory allocation may occur when calling this interface: /// 1. 
When the storage size is not enough to meet the current shape of the diff --git a/paddle/pten/tests/core/test_dense_tensor.cc b/paddle/pten/tests/core/test_dense_tensor.cc index c6db228c2b757..8277c0d8dadb7 100644 --- a/paddle/pten/tests/core/test_dense_tensor.cc +++ b/paddle/pten/tests/core/test_dense_tensor.cc @@ -116,9 +116,6 @@ TEST(dense_tensor, resize) { CHECK_EQ(tensor_0.capacity(), 6u); tensor_0.mutable_data(); CHECK_EQ(tensor_0.capacity(), 6u); - - auto storage = tensor_0.release(); - CHECK_EQ(storage->size(), 6u); } TEST(dense_tensor, shallow_copy) { @@ -133,10 +130,6 @@ TEST(dense_tensor, shallow_copy) { DenseTensor tensor_1(tensor_0); CHECK(tensor_0.meta() == tensor_1.meta()); - - // Copy constructor: Now shares the underlying shared_ptr instead - // of Storage - CHECK(tensor_0.release() != tensor_1.release()); } } // namespace tests From 9e0686ed45f79bbe6a5434bf453509cab0b630ea Mon Sep 17 00:00:00 2001 From: Sing_chan <51314274+betterpig@users.noreply.github.com> Date: Fri, 14 Jan 2022 11:29:37 +0800 Subject: [PATCH 17/24] fix bug of -DPADDLE_WITH_SSE3 not set when WITH_AVX AND AVX_FOUND even SSE3_FOUND (#38931) --- cmake/configure.cmake | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 32ba2ff3ac627..88e8dde8addbc 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -31,10 +31,12 @@ endif(NOT WITH_PROFILER) if(WITH_AVX AND AVX_FOUND) set(SIMD_FLAG ${AVX_FLAG}) add_definitions(-DPADDLE_WITH_AVX) -elseif(SSE3_FOUND) - if(NOT WIN32) - set(SIMD_FLAG ${SSE3_FLAG}) - endif() +elseif(SSE3_FOUND AND NOT WIN32) + set(SIMD_FLAG ${SSE3_FLAG}) +endif() + +if (SSE3_FOUND) + # TODO: Runtime detection should be used here. add_definitions(-DPADDLE_WITH_SSE3) endif() From 7f8d5bc8f02d10db46cce9a975db584528742ed7 Mon Sep 17 00:00:00 2001 From: qipengh Date: Fri, 14 Jan 2022 11:37:26 +0800 Subject: [PATCH 18/24] [MLU]Add mean and reduce_mean op (#38872) * [MLU]: add mean and reduce mean op * [MLU]add mlu pytest dir in CMakeLists.txt * [MLU]fix tensor data * [MLU]fix TensorToPyArray and license --- paddle/fluid/framework/tensor_util.cc | 40 +++- paddle/fluid/memory/detail/buddy_allocator.cc | 5 +- paddle/fluid/memory/memcpy.cc | 10 + paddle/fluid/operators/mean_op_mlu.cc | 127 ++++++++++++ paddle/fluid/operators/mlu/mlu_baseop.h | 15 +- .../reduce_ops/reduce_mean_op_mlu.cc | 127 ++++++++++++ paddle/fluid/pybind/tensor_py.h | 28 ++- .../fluid/tests/unittests/CMakeLists.txt | 4 + .../fluid/tests/unittests/mlu/CMakeLists.txt | 9 + .../tests/unittests/mlu/test_mean_op_mlu.py | 83 ++++++++ .../unittests/mlu/test_reduce_mean_op_mlu.py | 185 ++++++++++++++++++ .../tests/unittests/mlu/test_relu_op_mlu.py | 166 ++++++++++++++++ .../paddle/fluid/tests/unittests/op_test.py | 11 +- 13 files changed, 796 insertions(+), 14 deletions(-) create mode 100644 paddle/fluid/operators/mean_op_mlu.cc create mode 100644 paddle/fluid/operators/reduce_ops/reduce_mean_op_mlu.cc create mode 100644 python/paddle/fluid/tests/unittests/mlu/CMakeLists.txt create mode 100644 python/paddle/fluid/tests/unittests/mlu/test_mean_op_mlu.py create mode 100644 python/paddle/fluid/tests/unittests/mlu/test_reduce_mean_op_mlu.py create mode 100644 python/paddle/fluid/tests/unittests/mlu/test_relu_op_mlu.py diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 5fd581220097b..724e3cc1e2ee8 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -396,7 +396,8 @@ void 
TensorCopyImpl(const TENSOR& src, const platform::Place& dst_place, TENSOR* dst) { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); const platform::DeviceContext* dev_ctx; - if (platform::is_gpu_place(dst_place) || platform::is_npu_place(dst_place)) { + if (platform::is_gpu_place(dst_place) || platform::is_npu_place(dst_place) || + platform::is_mlu_place(dst_place)) { dev_ctx = pool.Get(dst_place); } else { dev_ctx = pool.Get(src.place()); @@ -1048,6 +1049,29 @@ void TensorToStream(std::ostream& os, const Tensor& tensor, #else PADDLE_THROW(platform::errors::Unimplemented( "XPUPlace is not supported when not compiled with XPU")); +#endif + } else if (platform::is_mlu_place(tensor.place())) { +#ifdef PADDLE_WITH_MLU + constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB + std::unique_ptr buf(new char[kBufSize]); + auto& mlu_dev_ctx = + static_cast(dev_ctx); + platform::CPUPlace cpu; + uintptr_t data = reinterpret_cast(data_ptr); + while (size != 0) { + size_t size_to_write = std::min(kBufSize, static_cast(size)); + memory::Copy(cpu, buf.get(), + BOOST_GET_CONST(platform::MLUPlace, tensor.place()), + reinterpret_cast(data), size_to_write, + mlu_dev_ctx.stream()); + mlu_dev_ctx.Wait(); + os.write(buf.get(), size_to_write); + data += size_to_write; + size -= size_to_write; + } +#else + PADDLE_THROW(platform::errors::Unimplemented( + "MLUPlace is not supported when not compiled with MLU")); #endif } else if (platform::is_npu_place(tensor.place())) { #ifdef PADDLE_WITH_ASCEND_CL @@ -1127,9 +1151,11 @@ void TensorFromStream(std::istream& is, Tensor* tensor, size_t size = tensor->numel() * framework::SizeOfType(desc.data_type()); if (platform::is_gpu_place(dev_ctx.GetPlace()) || platform::is_xpu_place(dev_ctx.GetPlace()) || + platform::is_mlu_place(dev_ctx.GetPlace()) || platform::is_npu_place(dev_ctx.GetPlace())) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_ASCEND_CL) + defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_MLU) || \ + defined(PADDLE_WITH_ASCEND_CL) Tensor cpu_tensor; cpu_tensor.Resize(framework::make_ddim(shape)); framework::VisitDataType( @@ -1148,6 +1174,9 @@ void TensorFromStream(std::istream& is, Tensor* tensor, } else if (platform::is_xpu_place(dev_ctx.GetPlace())) { PADDLE_THROW(platform::errors::Unimplemented( "XPUPlace is not supported when not compiled with XPU")); + } else if (platform::is_mlu_place(dev_ctx.GetPlace())) { + PADDLE_THROW(platform::errors::Unimplemented( + "MLUPlace is not supported when not compiled with MLU")); } else { PADDLE_THROW(platform::errors::Unimplemented( "NPUPlace is not supported when not compiled with NPU")); @@ -1192,9 +1221,11 @@ void TensorFromStream(std::istream& is, Tensor* tensor, size_t size = tensor->numel() * framework::SizeOfType(desc.data_type()); if (platform::is_gpu_place(dev_ctx.GetPlace()) || platform::is_xpu_place(dev_ctx.GetPlace()) || + platform::is_mlu_place(dev_ctx.GetPlace()) || platform::is_npu_place(dev_ctx.GetPlace())) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_ASCEND_CL) + defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_MLU) || \ + defined(PADDLE_WITH_ASCEND_CL) Tensor cpu_tensor; cpu_tensor.Resize(framework::make_ddim(dims)); framework::VisitDataType( @@ -1213,6 +1244,9 @@ void TensorFromStream(std::istream& is, Tensor* tensor, } else if (platform::is_xpu_place(dev_ctx.GetPlace())) { PADDLE_THROW(platform::errors::Unimplemented( "XPUPlace is not 
supported when not compiled with XPU")); + } else if (platform::is_mlu_place(dev_ctx.GetPlace())) { + PADDLE_THROW(platform::errors::Unimplemented( + "MLUPlace is not supported when not compiled with MLU")); } else { PADDLE_THROW(platform::errors::Unimplemented( "NPUPlace is not supported when not compiled with NPU")); diff --git a/paddle/fluid/memory/detail/buddy_allocator.cc b/paddle/fluid/memory/detail/buddy_allocator.cc index 96fcd6254d885..b02fb6642be3f 100644 --- a/paddle/fluid/memory/detail/buddy_allocator.cc +++ b/paddle/fluid/memory/detail/buddy_allocator.cc @@ -231,9 +231,8 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool( allocate_bytes = DeviceAllocateSize(&platform::NPUInitAllocSize, &platform::NPUReallocSize, request_bytes); #elif defined(PADDLE_WITH_MLU) - allocate_bytes = - DeviceAllocateSize(&platform::MLUInitAllocSize(), - &platform::MLUReallocSize(), request_bytes); + allocate_bytes = DeviceAllocateSize(&platform::MLUInitAllocSize, + &platform::MLUReallocSize, request_bytes); #endif // Allocate a new block diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc index e6aed2c90dace..153e19a9f1450 100644 --- a/paddle/fluid/memory/memcpy.cc +++ b/paddle/fluid/memory/memcpy.cc @@ -508,6 +508,9 @@ void Copy(platform::CPUPlace dst_place, platform::RecordEvent record_event("MLUMemcpyD2HAsync:MLU->CPU"); platform::MLUMemcpyD2HAsync(dst, src, num, stream); } else { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + static_cast(pool.Get(src_place))->Wait(); + VLOG(4) << "Sync memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place; platform::RecordEvent record_event("MLUMemcpyD2HSync:MLU->CPU"); @@ -530,6 +533,9 @@ void Copy(platform::MLUPlace dst_place, platform::RecordEvent record_event("MLUMemcpyH2DAsync:CPU->MLU"); platform::MLUMemcpyH2DAsync(dst, src, num, stream); } else { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + static_cast(pool.Get(src_place))->Wait(); + VLOG(4) << "Sync memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place; platform::RecordEvent record_event("MLUMemcpyH2DSync:CPU->MLU"); @@ -554,6 +560,10 @@ void Copy(platform::MLUPlace dst_place, "MLUMemcpyD2DAsync(same_mlu):MLU->MLU"); platform::MLUMemcpyD2DAsync(dst, src, num, stream); } else { + platform::DeviceContextPool& pool = + platform::DeviceContextPool::Instance(); + static_cast(pool.Get(src_place))->Wait(); + VLOG(4) << "Sync memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place; platform::RecordEvent record_event("MLUMemcpyD2DSync(same_mlu):MLU->MLU"); diff --git a/paddle/fluid/operators/mean_op_mlu.cc b/paddle/fluid/operators/mean_op_mlu.cc new file mode 100644 index 0000000000000..9862c2bd95256 --- /dev/null +++ b/paddle/fluid/operators/mean_op_mlu.cc @@ -0,0 +1,127 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
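+
+// MLU implementation of the mean op:
+//  - MeanMLUKernel reduces X with MLUCnnl::Reduce (CNNL_REDUCE_AVG over all
+//    dims) to produce Out; rank-0 inputs are copied through directly.
+//  - MeanMLUGradKernel expects Out@Grad to hold a single element, fills
+//    X@Grad with 1 / numel(X) via MLUCnnl::Fill, and multiplies it by
+//    Out@Grad with MLUCnnl::OpTensor (CNNL_OP_TENSOR_MUL).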
+ +#include "paddle/fluid/operators/mean_op.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" +#include "paddle/fluid/platform/device/mlu/device_context.h" +#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace operators { + +template +class MeanMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* input = context.Input("X"); + auto* output = context.Output("Out"); + + const T* in_data = input->data(); + T* out_data = output->mutable_data(context.GetPlace()); + auto numel = input->numel(); + auto rank = input->dims().size(); + auto place = context.GetPlace(); + auto stream = context.template device_context().stream(); + + if (rank == 0) { // scalar + auto mlu_place = BOOST_GET(platform::MLUPlace, place); + memory::Copy(mlu_place, out_data, mlu_place, in_data, numel * sizeof(T), + stream); + return; + } + + std::vector reduce_dims; + reduce_dims.reserve(rank); + for (decltype(rank) i = 0; i < rank; ++i) { + reduce_dims.push_back(i); + } + + MLUCnnlTensorDesc input_desc(*input, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(input->type())); + MLUCnnlTensorDesc output_desc(*output, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(output->type())); + + MLUCnnlReduceDesc reduction_desc( + reduce_dims, CNNL_REDUCE_AVG, ToCnnlDataType(), + CNNL_NOT_PROPAGATE_NAN, CNNL_REDUCE_NO_INDICES, CNNL_32BIT_INDICES); + + MLUCnnl::Reduce(context, true /*need_workspace*/, reduction_desc.get(), + nullptr, input_desc.get(), + reinterpret_cast(in_data), 0 /*indices_size*/, + nullptr, nullptr, output_desc.get(), + reinterpret_cast(out_data)); + } +}; + +template +class MeanMLUGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto output_grad = context.Input(framework::GradVarName("Out")); + PADDLE_ENFORCE_EQ(output_grad->numel(), 1, + platform::errors::InvalidArgument( + "Mean Gradient Input Tensor len should be 1. 
But " + "received Out@Grad's elements num is %d.", + output_grad->numel())); + auto input_grad = context.Output(framework::GradVarName("X")); + input_grad->mutable_data(context.GetPlace()); + + auto in_data = output_grad->data(); + auto numel = input_grad->numel(); + auto rank = input_grad->dims().size(); + auto out_data = input_grad->data(); + auto place = context.GetPlace(); + auto stream = context.template device_context().stream(); + + if (rank == 0) { // scalar + auto mlu_place = BOOST_GET(platform::MLUPlace, place); + memory::Copy(mlu_place, out_data, mlu_place, in_data, numel * sizeof(T), + stream); + return; + } + + // means + Tensor mean_var(output_grad->type()); + mean_var.mutable_data(input_grad->dims(), context.GetPlace()); + MLUCnnlTensorDesc mean_var_desc(mean_var, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(mean_var.type())); + auto value = static_cast(1.0 / static_cast(input_grad->numel())); + MLUCnnl::Fill(context, value, mean_var_desc.get(), GetBasePtr(&mean_var)); + + // means mul output_grad + MLUCnnlTensorDesc in_desc(*output_grad, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(output_grad->type())); + MLUCnnlTensorDesc out_desc(*input_grad, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(input_grad->type())); + + MLUCnnlOpTensorDesc op_tensor_desc(CNNL_OP_TENSOR_MUL, ToCnnlDataType(), + CNNL_NOT_PROPAGATE_NAN); + + MLUCnnl::OpTensor(context, op_tensor_desc.get(), in_desc.get(), + reinterpret_cast(in_data), + mean_var_desc.get(), GetBasePtr(&mean_var), + out_desc.get(), reinterpret_cast(out_data), + ToCnnlDataType()); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_MLU_KERNEL(mean, ops::MeanMLUKernel, + ops::MeanMLUKernel); +REGISTER_OP_MLU_KERNEL(mean_grad, ops::MeanMLUGradKernel, + ops::MeanMLUGradKernel); diff --git a/paddle/fluid/operators/mlu/mlu_baseop.h b/paddle/fluid/operators/mlu/mlu_baseop.h index ab398a92c2972..8082c45d14b95 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.h +++ b/paddle/fluid/operators/mlu/mlu_baseop.h @@ -45,12 +45,22 @@ enum MLULogicMethod { CNNL_LOGIC_OP_OR = 7, }; +inline const void* GetBasePtr(const Tensor* t) { return t->data(); } + +inline void* GetBasePtr(Tensor* t) { return t->data(); } + template inline cnnlDataType_t ToCnnlDataType(const T& t) { auto type = framework::ToDataType(t); return ToCnnlDataType(type); } +template +inline cnnlDataType_t ToCnnlDataType() { + auto type = framework::ToDataType(std::type_index(typeid(T))); + return ToCnnlDataType(type); +} + template <> inline cnnlDataType_t ToCnnlDataType(const framework::proto::VarType::Type& t) { cnnlDataType_t type = CNNL_DTYPE_FLOAT; @@ -89,11 +99,12 @@ NarrowT CheckedNarrowing(const WideT& wide) { return narrow; } -static cnnlHandle_t GetHandleFromCTX(const ExecutionContext& ctx) { +inline static cnnlHandle_t GetHandleFromCTX(const ExecutionContext& ctx) { return ctx.template device_context().cnnl_handle(); } -static const MLUDeviceContext& GetDevCtxFromCTX(const ExecutionContext& ctx) { +inline static const MLUDeviceContext& GetDevCtxFromCTX( + const ExecutionContext& ctx) { return ctx.template device_context(); } diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op_mlu.cc b/paddle/fluid/operators/reduce_ops/reduce_mean_op_mlu.cc new file mode 100644 index 0000000000000..ef7e9940f0590 --- /dev/null +++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op_mlu.cc @@ -0,0 +1,127 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reduce_ops/reduce_mean_op.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" +#include "paddle/fluid/platform/device/mlu/device_context.h" + +namespace paddle { +namespace operators { + +template +class ReduceMeanMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* input = context.Input("X"); + auto* output = context.Output("Out"); + output->mutable_data(context.GetPlace()); + + bool reduce_all = context.Attr("reduce_all"); + auto dims = context.Attr>("dim"); + auto input_dims = framework::vectorize(input->dims()); + const auto& input_dim_size = input->dims().size(); + std::vector reduce_dims; + if (reduce_all) { + for (size_t i = 0; i < input_dims.size(); i++) { + reduce_dims.push_back(static_cast(i)); + } + } else { + for (size_t i = 0; i < dims.size(); ++i) { + if (dims[i] < 0) { + reduce_dims.push_back(dims[i] + input_dim_size); + } else { + reduce_dims.push_back(dims[i]); + } + } + } + + MLUCnnlTensorDesc input_desc(*input, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(input->type())); + MLUCnnlTensorDesc output_desc(*output, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(output->type())); + + MLUCnnlReduceDesc reduction_desc( + reduce_dims, CNNL_REDUCE_AVG, ToCnnlDataType(), + CNNL_NOT_PROPAGATE_NAN, CNNL_REDUCE_NO_INDICES, CNNL_32BIT_INDICES); + + MLUCnnl::Reduce(context, true /*need_workspace*/, reduction_desc.get(), + nullptr, input_desc.get(), GetBasePtr(input), + 0 /*indices_size*/, nullptr, nullptr, output_desc.get(), + GetBasePtr(output)); + } +}; + +template +class ReduceMeanGradMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* input = context.Input("X"); + auto* output_grad = context.Input(framework::GradVarName("Out")); + auto* input_grad = context.Output(framework::GradVarName("X")); + input_grad->mutable_data(context.GetPlace()); + + bool reduce_all = context.Attr("reduce_all"); + auto reduce_dims = context.Attr>("dim"); + auto input_dims = framework::vectorize(input->dims()); + + int reduce_numel = 1; + if (reduce_all) { + reduce_dims.clear(); + for (size_t d = 0; d < input_dims.size(); ++d) { + reduce_dims.push_back(static_cast(d)); + } + } + for (auto& d : reduce_dims) { + if (d < 0) { + d = d + input_dims.size(); + } + reduce_numel *= input_dims[d]; + } + + Tensor tmp_output_grad(output_grad->type()); + auto tmp_output_dims = input_dims; + for (auto d : reduce_dims) { + tmp_output_dims[d] = 1; + } + tmp_output_grad.ShareDataWith(*output_grad); + tmp_output_grad.Resize(framework::make_ddim(tmp_output_dims)); + + MLUCnnlTensorDesc output_grad_desc(tmp_output_grad, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(tmp_output_grad.type())); + MLUCnnlTensorDesc input_grad_desc(*input_grad, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(input_grad->type())); + + auto value = static_cast(1.0 / static_cast(reduce_numel)); + MLUCnnl::Fill(context, value, 
input_grad_desc.get(), + GetBasePtr(input_grad)); + + MLUCnnlOpTensorDesc op_tensor_desc(CNNL_OP_TENSOR_MUL, ToCnnlDataType(), + CNNL_NOT_PROPAGATE_NAN); + + MLUCnnl::OpTensor(context, op_tensor_desc.get(), output_grad_desc.get(), + GetBasePtr(&tmp_output_grad), input_grad_desc.get(), + GetBasePtr(input_grad), input_grad_desc.get(), + GetBasePtr(input_grad), ToCnnlDataType()); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_MLU_KERNEL(reduce_mean, ops::ReduceMeanMLUKernel, + ops::ReduceMeanMLUKernel); +REGISTER_OP_MLU_KERNEL(reduce_mean_grad, ops::ReduceMeanGradMLUKernel, + ops::ReduceMeanGradMLUKernel); diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index b31b7456ebca7..1fe6686919453 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -232,6 +232,13 @@ T TensorGetElement(const framework::Tensor &self, size_t offset) { auto p = BOOST_GET_CONST(platform::CUDAPlace, self.place()); paddle::memory::Copy(platform::CPUPlace(), &b, p, a + offset, sizeof(T), nullptr); +#endif + } else if (platform::is_mlu_place(self.place())) { +#ifdef PADDLE_WITH_MLU + const T *a = self.data(); + auto p = BOOST_GET_CONST(platform::MLUPlace, self.place()); + paddle::memory::Copy(platform::CPUPlace(), &b, p, a + offset, sizeof(T), + nullptr); #endif } else if (platform::is_npu_place(self.place())) { #if defined(PADDLE_WITH_ASCEND_CL) @@ -267,6 +274,13 @@ void TensorSetElement(framework::Tensor *self, size_t offset, T elem) { T *a = self->mutable_data(p); paddle::memory::Copy(p, a + offset, platform::CPUPlace(), &elem, sizeof(T), nullptr); +#endif + } else if (platform::is_mlu_place(self->place())) { +#ifdef PADDLE_WITH_MLU + auto p = BOOST_GET_CONST(platform::MLUPlace, self->place()); + T *a = self->mutable_data(p); + paddle::memory::Copy(p, a + offset, platform::CPUPlace(), &elem, sizeof(T), + nullptr); #endif } else if (platform::is_npu_place(self->place())) { #if defined(PADDLE_WITH_ASCEND_CL) @@ -543,6 +557,11 @@ inline framework::Tensor *_getTensor(const framework::Tensor &self, #ifdef PADDLE_WITH_XPU output->mutable_data(BOOST_GET_CONST(platform::XPUPlace, place), self.type()); +#endif + } else if (platform::is_mlu_place(place)) { +#ifdef PADDLE_WITH_MLU + output->mutable_data(BOOST_GET_CONST(platform::MLUPlace, place), + self.type()); #endif } else { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) @@ -845,8 +864,13 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor, size_t copy_bytes = sizeof_dtype * numel; auto p = BOOST_GET_CONST(platform::MLUPlace, tensor.place()); - paddle::memory::Copy(platform::CPUPlace(), py_arr.mutable_data(), p, - tensor_buf_ptr, copy_bytes, nullptr); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &ctx = *pool.Get(tensor.place()); + paddle::memory::Copy( + platform::CPUPlace(), py_arr.mutable_data(), p, tensor_buf_ptr, + copy_bytes, + reinterpret_cast(ctx).stream()); + ctx.Wait(); return py_arr; #else PADDLE_THROW(platform::errors::PermissionDenied( diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index b46a10c8c79d8..67697fcfd8398 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -803,6 +803,10 @@ if (WITH_MKLDNN) add_subdirectory(mkldnn) endif() +if (WITH_MLU) + add_subdirectory(mlu) +endif() + add_subdirectory(asp) 
add_subdirectory(ir) diff --git a/python/paddle/fluid/tests/unittests/mlu/CMakeLists.txt b/python/paddle/fluid/tests/unittests/mlu/CMakeLists.txt new file mode 100644 index 0000000000000..8fcd3f196dc19 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/CMakeLists.txt @@ -0,0 +1,9 @@ +file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") +string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") + +if (WITH_MLU) + foreach(TEST_OP ${TEST_OPS}) + py_test_modules(${TEST_OP} MODULES ${TEST_OP}) + endforeach(TEST_OP) + +endif() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_mean_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_mean_op_mlu.py new file mode 100644 index 0000000000000..36419327db6b0 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_mean_op_mlu.py @@ -0,0 +1,83 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid +from paddle.fluid import core + +paddle.enable_static() +SEED = 2021 + + +class TestMean(OpTest): + def setUp(self): + self.set_mlu() + self.place = paddle.device.MLUPlace(0) + self.op_type = "mean" + self.init_dtype() + + x = np.random.random([1, 100]).astype(self.dtype) + self.inputs = {'X': x} + + self.attrs = {} + np_out = np.mean(x) + self.outputs = {'Out': np_out} + + def set_mlu(self): + self.__class__.use_mlu = True + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ['X'], 'Out') + + +class TestMeanFP16(OpTest): + def setUp(self): + self.set_mlu() + self.place = paddle.MLUPlace(0) + self.op_type = "mean" + self.init_dtype() + + x = np.random.random([3, 200]).astype(self.dtype) + self.inputs = {'X': x} + + self.attrs = {} + np_out = np.mean(x) + self.outputs = {'Out': np_out} + + def set_mlu(self): + self.__class__.use_mlu = True + self.__class__.no_need_check_grad = True + + def init_dtype(self): + self.dtype = np.float16 + + def test_check_output(self): + self.check_output_with_place(self.place) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_reduce_mean_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_reduce_mean_op_mlu.py new file mode 100644 index 0000000000000..c0be644c79115 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_reduce_mean_op_mlu.py @@ -0,0 +1,185 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid + +paddle.enable_static() + + +class TestMeanOp(OpTest): + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def setUp(self): + self.set_mlu() + self.op_type = "reduce_mean" + self.inputs = {'X': np.random.random((5, 6, 10)).astype("float32")} + self.outputs = {'Out': self.inputs['X'].mean(axis=0)} + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ['X'], 'Out') + + +class TestMeanOp5D(TestMeanOp): + def setUp(self): + self.set_mlu() + self.op_type = "reduce_mean" + self.inputs = { + 'X': np.random.random((1, 2, 5, 6, 10)).astype("float32") + } + self.outputs = {'Out': self.inputs['X'].mean(axis=0)} + + +class TestMeanOp6D(TestMeanOp): + def setUp(self): + self.set_mlu() + self.op_type = "reduce_mean" + self.inputs = { + 'X': np.random.random((1, 1, 2, 5, 6, 10)).astype("float32") + } + self.outputs = {'Out': self.inputs['X'].mean(axis=0)} + + +class TestMeanOp8D(TestMeanOp): + def setUp(self): + self.set_mlu() + self.op_type = "reduce_mean" + self.inputs = { + 'X': np.random.random((1, 3, 1, 2, 1, 4, 3, 10)).astype("float32") + } + self.attrs = {'dim': (0, 3)} + self.outputs = {'Out': self.inputs['X'].mean(axis=(0, 3))} + + +class Test1DReduce(TestMeanOp): + def setUp(self): + self.set_mlu() + self.op_type = "reduce_mean" + self.inputs = {'X': np.random.random(120).astype("float32")} + self.outputs = {'Out': self.inputs['X'].mean(axis=0)} + + +class Test2DReduce0(Test1DReduce): + def setUp(self): + self.set_mlu() + self.op_type = "reduce_mean" + self.attrs = {'dim': [0]} + self.inputs = {'X': np.random.random((20, 10)).astype("float32")} + self.outputs = {'Out': self.inputs['X'].mean(axis=0)} + + +class Test2DReduce1(Test1DReduce): + def setUp(self): + self.set_mlu() + self.op_type = "reduce_mean" + self.attrs = {'dim': [1]} + self.inputs = {'X': np.random.random((20, 10)).astype("float32")} + self.outputs = { + 'Out': self.inputs['X'].mean(axis=tuple(self.attrs['dim'])) + } + + +class Test3DReduce0(Test1DReduce): + def setUp(self): + self.set_mlu() + self.op_type = "reduce_mean" + self.attrs = {'dim': [1]} + self.inputs = {'X': np.random.random((5, 6, 7)).astype("float32")} + self.outputs = { + 'Out': self.inputs['X'].mean(axis=tuple(self.attrs['dim'])) + } + + +class Test3DReduce1(Test1DReduce): + def setUp(self): + self.set_mlu() + self.op_type = "reduce_mean" + self.attrs = {'dim': [2]} + self.inputs = {'X': np.random.random((5, 6, 7)).astype("float32")} + self.outputs = { + 'Out': self.inputs['X'].mean(axis=tuple(self.attrs['dim'])) + } + + +class Test3DReduce2(Test1DReduce): + def setUp(self): + self.set_mlu() + self.op_type = "reduce_mean" + self.attrs = {'dim': [-2]} + self.inputs = {'X': np.random.random((5, 6, 7)).astype("float32")} + self.outputs = { + 'Out': self.inputs['X'].mean(axis=tuple(self.attrs['dim'])) + } + + +class 
Test3DReduce3(Test1DReduce): + def setUp(self): + self.set_mlu() + self.op_type = "reduce_mean" + self.attrs = {'dim': [1, 2]} + self.inputs = {'X': np.random.random((5, 6, 7)).astype("float32")} + self.outputs = { + 'Out': self.inputs['X'].mean(axis=tuple(self.attrs['dim'])) + } + + +class TestKeepDimReduce(Test1DReduce): + def setUp(self): + self.set_mlu() + self.op_type = "reduce_mean" + self.inputs = {'X': np.random.random((5, 6, 10)).astype("float32")} + self.attrs = {'dim': [1], 'keep_dim': True} + self.outputs = { + 'Out': self.inputs['X'].mean( + axis=tuple(self.attrs['dim']), keepdims=self.attrs['keep_dim']) + } + + +class TestKeepDim8DReduce(Test1DReduce): + def setUp(self): + self.set_mlu() + self.op_type = "reduce_mean" + self.inputs = { + 'X': np.random.random((2, 5, 3, 2, 2, 3, 4, 2)).astype("float32") + } + self.attrs = {'dim': (3, 4, 5), 'keep_dim': True} + self.outputs = { + 'Out': self.inputs['X'].mean( + axis=tuple(self.attrs['dim']), keepdims=self.attrs['keep_dim']) + } + + +class TestReduceAll(Test1DReduce): + def setUp(self): + self.set_mlu() + self.op_type = "reduce_mean" + self.inputs = {'X': np.random.random((5, 6, 2, 10)).astype("float32")} + self.attrs = {'reduce_all': True} + self.outputs = {'Out': self.inputs['X'].mean()} + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_relu_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_relu_op_mlu.py new file mode 100644 index 0000000000000..25c50f67949e7 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_relu_op_mlu.py @@ -0,0 +1,166 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
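+
+# Unit tests for the MLU relu kernel: TestRelu/TestReluFp16/TestReluNeg mark
+# the class with use_mlu and verify forward outputs on paddle.MLUPlace(0) via
+# check_output_with_place; TestReluNet runs a small static-graph network on
+# both MLU and CPU and compares predictions and loss with np.allclose.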
+ +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid + +paddle.enable_static() +SEED = 2021 + + +class TestRelu(OpTest): + def setUp(self): + self.set_mlu() + self.op_type = "relu" + self.place = paddle.MLUPlace(0) + + self.init_dtype() + np.random.seed(SEED) + x = np.random.rand(3, 2).astype(self.dtype) + out = x + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.attrs = {} + self.outputs = {'Out': out} + + def set_mlu(self): + self.__class__.use_mlu = True + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place) + + +class TestReluFp16(OpTest): + def setUp(self): + self.set_mlu() + self.op_type = "relu" + self.place = paddle.MLUPlace(0) + + self.init_dtype() + np.random.seed(SEED) + x = np.random.rand(3, 2).astype(self.dtype) + out = x + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.attrs = {} + self.outputs = {'Out': out} + + def set_mlu(self): + self.__class__.use_mlu = True + self.__class__.no_need_check_grad = True + + def init_dtype(self): + self.dtype = np.float16 + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-5) + + +class TestReluNeg(OpTest): + def setUp(self): + self.set_mlu() + self.op_type = "relu" + self.place = paddle.MLUPlace(0) + + self.init_dtype() + np.random.seed(SEED) + x = np.array([0.1, -0.1, -1.0]).astype(self.dtype) + out = np.array([0.1, 0.0, 0.0]).astype(self.dtype) + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.attrs = {} + self.outputs = {'Out': out} + + def set_mlu(self): + self.__class__.use_mlu = True + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place) + + +class TestReluNet(unittest.TestCase): + def _test(self, run_mlu=True): + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = SEED + startup_prog.random_seed = SEED + np.random.seed(SEED) + + a_np = np.random.random(size=(32, 32)).astype('float32') + b_np = np.random.random(size=(32, 32)).astype('float32') + label_np = np.random.randint(2, size=(32, 1)).astype('int64') + + with paddle.static.program_guard(main_prog, startup_prog): + a = paddle.static.data(name="a", shape=[32, 32], dtype='float32') + b = paddle.static.data(name="b", shape=[32, 32], dtype='float32') + label = paddle.static.data( + name="label", shape=[32, 1], dtype='int64') + + sum = paddle.add(a, b) + z = paddle.nn.functional.relu(sum) + + fc_1 = fluid.layers.fc(input=z, size=128) + prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax') + + cost = fluid.layers.cross_entropy(input=prediction, label=label) + loss = fluid.layers.reduce_mean(cost) + sgd = fluid.optimizer.SGD(learning_rate=0.01) + sgd.minimize(loss) + + if run_mlu: + place = paddle.MLUPlace(0) + else: + place = paddle.CPUPlace() + + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + print("Start run on {}".format(place)) + for epoch in range(100): + + pred_res, loss_res = exe.run( + main_prog, + feed={"a": a_np, + "b": b_np, + "label": label_np}, + fetch_list=[prediction, loss]) + if epoch % 10 == 0: + print("Epoch {} | Prediction[0]: {}, Loss: {}".format( + epoch, pred_res[0], loss_res)) + + return pred_res, loss_res + + def test_mlu(self): + cpu_pred, cpu_loss = self._test(False) + mlu_pred, mlu_loss = self._test(True) + + 
self.assertTrue(np.allclose(mlu_pred, cpu_pred)) + self.assertTrue(np.allclose(mlu_loss, cpu_loss)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index ec59c27558332..01d851469a8d1 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -326,6 +326,9 @@ def is_rocm_op_test(): def is_npu_op_test(): return hasattr(cls, "use_npu") and cls.use_npu == True + def is_mlu_op_test(): + return hasattr(cls, "use_mlu") and cls.use_mlu == True + if not hasattr(cls, "op_type"): raise AssertionError( "This test do not have op_type in class attrs, " @@ -348,7 +351,8 @@ def is_npu_op_test(): and not is_xpu_op_test() \ and not is_mkldnn_op_test() \ and not is_rocm_op_test() \ - and not is_npu_op_test(): + and not is_npu_op_test() \ + and not is_mlu_op_test(): raise AssertionError( "This test of %s op needs check_grad with fp64 precision." % cls.op_type) @@ -1297,7 +1301,8 @@ def find_actual(target_name, fetch_list): # No effect on original OpTest # Currently not support ParallelExecutor on XPUPlace. if not paddle.is_compiled_with_xpu( - ) and not paddle.is_compiled_with_npu(): + ) and not paddle.is_compiled_with_npu( + ) and not paddle.is_compiled_with_mlu(): self.check_inplace_output_with_place( place, no_check_set=no_check_set, inplace_atol=inplace_atol) @@ -1547,11 +1552,9 @@ def check_grad_with_place(self, delta=numeric_grad_delta, in_place=in_place) for input_to_check in inputs_to_check ] - analytic_grads = self._get_gradient(inputs_to_check, place, output_names, no_grad_set, user_defined_grad_outputs) - # comparison of bf16 results will happen as fp32 # loop over list of grads and convert bf16 to fp32 fp32_analytic_grads = [] From 556d509791b2b0a6c12781f7ecb6bbf811ee3bec Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Fri, 14 Jan 2022 11:47:16 +0800 Subject: [PATCH 19/24] refactor impl of elementwise op part2 (#38898) --- .../elementwise/elementwise_op_function.h | 621 +------------- paddle/pten/kernels/cpu/elementwise.h | 144 ++++ paddle/pten/kernels/gpu/elementwise.h | 768 ++++++++++++++++++ 3 files changed, 919 insertions(+), 614 deletions(-) diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h index 626046890fb06..7cd04318d3f49 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h @@ -49,12 +49,6 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/for_range.h" -#define GetDivMod(dividend, divisor, div, mod) \ - do { \ - const auto dividend_copy = dividend; \ - *div = dividend_copy / divisor; \ - *mod = dividend_copy % divisor; \ - } while (0) #define DIVUP(x, y) (((x) + (y)-1) / (y)) @@ -138,613 +132,11 @@ inline void GetBroadcastDimsArrays(const framework::DDim &x_dims, axis); } -template -void CommonForwardBroadcastCPU(const framework::Tensor *x, - const framework::Tensor *y, framework::Tensor *z, - int *x_dims_array, int *y_dims_array, - int *out_dims_array, int max_dim, - const platform::CPUDeviceContext &ctx, - Functor func, - const bool is_xsize_larger = true) { - pten::CommonForwardBroadcastCPU(x, y, z, x_dims_array, y_dims_array, - out_dims_array, max_dim, ctx, func, - is_xsize_larger); -} - -#if defined(__NVCC__) || defined(__HIPCC__) - -template -__global__ void CommonGradBroadcastCUDAKernel( - const int *x_strides_array, const int *y_strides_array, - const int *out_dims_array, const int *y_strides_order, - const int *y_dims_order, const T *x, const T *y, const Tout *out, - const Tout *dout, T *dx, int out_size, int max_dim, int thread_num, - DX_OP dx_op) { - T val(0); - int i = blockIdx.x; - int tid = threadIdx.x; - for (int j = tid; j < thread_num; j += blockDim.x) { - const int X_index = i * thread_num + j; - int out_index = X_index; - int C_index = 0; - int B_index = i * thread_num + j; - int remainder = 0; -#pragma unroll - for (int d = max_dim - 1; d >= 0; --d) { - GetDivMod(B_index, y_dims_order[d], &B_index, &remainder); - C_index += remainder * y_strides_order[d]; - } - int x_index = 0; - int y_index = 0; - int C_index_val = C_index; -#pragma unroll - for (int d = max_dim - 1; d >= 0; --d) { - GetDivMod(C_index_val, out_dims_array[d], &C_index_val, &remainder); - x_index += remainder * x_strides_array[d]; - y_index += remainder * y_strides_array[d]; - } - out_index = C_index; - val += dx_op(x[x_index], y[y_index], out[out_index], dout[out_index]); - } - val = paddle::platform::reduceSum(val, tid, thread_num); - if (threadIdx.x == 0) { - dx[i] = val; - } -} - -template -void CommonGradBroadcastCUDA( - const framework::Tensor &x, const framework::Tensor &y, - const framework::Tensor &out, const framework::Tensor &dout, - framework::Tensor *dx, framework::Tensor *dy, int *x_dims_array, - int *y_dims_array, int *out_dims_array, int max_dim, - const platform::CUDADeviceContext &ctx, DX_OP dx_op, DY_OP dy_op) { - const auto gplace = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); - auto cplace = platform::CPUPlace(); - const T *x_data = x.data(); - const T *y_data = y.data(); - const Tout *out_data = out.data(); - const Tout *dout_data = dout.data(); - T *dx_data = dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()); - T *dy_data = dy == nullptr ? 
nullptr : dy->mutable_data(ctx.GetPlace()); - - std::vector x_one_indexs; - std::vector y_one_indexs; - for (int i = 0; i < max_dim; i++) { - if (x_dims_array[i] != y_dims_array[i]) { - if (x_dims_array[i] == 1) { - x_one_indexs.push_back(i); - } - if (y_dims_array[i] == 1) { - y_one_indexs.push_back(i); - } - } - } - - std::vector x_trans_indexs(max_dim); - std::vector y_trans_indexs(max_dim); - pten::ComputeBroadcastTranspositionArray( - x_one_indexs.data(), x_trans_indexs.data(), max_dim, x_one_indexs.size()); - pten::ComputeBroadcastTranspositionArray( - y_one_indexs.data(), y_trans_indexs.data(), max_dim, y_one_indexs.size()); - - // compute array stride for cuda kernel; - // e.g. x.dims=[2,3,4], x_stride=[12,4,1] - std::vector x_strides_array(max_dim); - std::vector y_strides_array(max_dim); - std::vector out_strides_array(max_dim); - int x_stride = 1; - int y_stride = 1; - int z_stride = 1; - for (int i = max_dim - 1; i >= 0; i--) { - x_strides_array[i] = x_dims_array[i] == 1 ? 0 : x_stride; - y_strides_array[i] = y_dims_array[i] == 1 ? 0 : y_stride; - out_strides_array[i] = z_stride; - x_stride *= x_dims_array[i]; - y_stride *= y_dims_array[i]; - z_stride *= out_dims_array[i]; - } - - std::vector x_strides_order(max_dim); - std::vector y_strides_order(max_dim); - std::vector x_dims_order(max_dim); - std::vector y_dims_order(max_dim); - for (int i = 0; i < max_dim; ++i) { - x_strides_order[i] = out_strides_array[x_trans_indexs[i]]; - y_strides_order[i] = out_strides_array[y_trans_indexs[i]]; - x_dims_order[i] = out_dims_array[x_trans_indexs[i]]; - y_dims_order[i] = out_dims_array[y_trans_indexs[i]]; - } - std::vector x_broadcast_pos; - std::vector y_broadcast_pos; - - int bytes = max_dim * sizeof(int); - - for (int i = 0; i < max_dim; ++i) { - if (x_dims_array[i] != out_dims_array[i] && x_dims_array[i] == 1) { - x_broadcast_pos.emplace_back(i); - } - if (y_dims_array[i] != out_dims_array[i] && y_dims_array[i] == 1) { - y_broadcast_pos.emplace_back(i); - } - } - - auto stream = ctx.stream(); - bool can_split_x = false; - bool can_split_y = false; - - auto FastCommonCUDAF = [&](const std::vector &broadcast_pos, bool is_y) { - int h = - std::accumulate(out_dims_array, out_dims_array + broadcast_pos.size(), - 1, std::multiplies()); - int w = - std::accumulate(out_dims_array + broadcast_pos.size(), - out_dims_array + max_dim, 1, std::multiplies()); - - VLOG(3) << "FastCommonCUDAF elementwise w:" << w << " h:" << h - << " is_y:" << is_y; - - int split_h; - int split_w; - int kh = h; - int kw = w; - - if (is_y) { - split_h = - std::accumulate(x_dims_array, x_dims_array + broadcast_pos.size(), 1, - std::multiplies()); - split_w = - std::accumulate(x_dims_array + broadcast_pos.size(), - x_dims_array + max_dim, 1, std::multiplies()); - - } else { - split_h = - std::accumulate(y_dims_array, y_dims_array + broadcast_pos.size(), 1, - std::multiplies()); - split_w = - std::accumulate(y_dims_array + broadcast_pos.size(), - y_dims_array + max_dim, 1, std::multiplies()); - } - - if (h > split_h) kh = split_h; - if (w > split_w) kw = split_w; - - if (is_y) { - if (w < 16 || h < 16) { - int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, h); - int grid_size = w; - pten::CommonGradBroadcast1CUDAKernelHeight<<>>( - x_data, y_data, out_data, dout_data, h, w, dy_op, dy_data, kh, kw, - is_y); - } else { - dim3 block_size = dim3(BLOCK_X, BLOCK_Y); - int grid_size = (w + BLOCK_X - 1) / BLOCK_X; - pten::FastCommonGradBroadcastCUDAKernelHeight<<>>( - x_data, y_data, out_data, dout_data, h, w, dy_op, dy_data, kh, 
kw, - is_y); - } - } else { - if (w < 16 || h < 16) { - int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, h); - int grid_size = w; - pten::CommonGradBroadcast1CUDAKernelHeight<<>>( - x_data, y_data, out_data, dout_data, h, w, dx_op, dx_data, kh, kw, - is_y); - } else { - dim3 block_size = dim3(BLOCK_X, BLOCK_Y); - int grid_size = (w + BLOCK_X - 1) / BLOCK_X; - pten::FastCommonGradBroadcastCUDAKernelHeight<<>>( - x_data, y_data, out_data, dout_data, h, w, dx_op, dx_data, kh, kw, - is_y); - } - } - }; - - auto FastBroadCastHeightCUDAF = [&](const std::vector &broadcast_pos, - bool x_large) { - int h = - std::accumulate(out_dims_array, out_dims_array + broadcast_pos.size(), - 1, std::multiplies()); - int w = - std::accumulate(out_dims_array + broadcast_pos.size(), - out_dims_array + max_dim, 1, std::multiplies()); - - VLOG(3) << "FastBroadCastHeightCUDAF w:" << w << " h:" << h; - - if (w < 16 || h < 16) { - int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, h); - int grid_size = w; - pten::ElemwiseGradBroadcast1CUDAKernel<<>>( - x_data, y_data, out_data, dout_data, h, w, x_large, dx_op, dy_op, - dx_data, dy_data); - } else { - dim3 block_size = dim3(BLOCK_X, BLOCK_Y); - int grid_size = (w + BLOCK_X - 1) / BLOCK_X; - pten::FastElemwiseGradBroadcast1CUDAKernel<<>>( - x_data, y_data, out_data, dout_data, h, w, x_large, dx_op, dy_op, - dx_data, dy_data); - } - }; - - auto FastBroadCastAllCUDAF = [&](const std::vector &broadcast_pos, - int max_dim, bool is_x_large) { - int axis = broadcast_pos[0]; - int pre = std::accumulate(out_dims_array, out_dims_array + axis, 1, - std::multiplies()); - int mid = 1; - int post = 1; - - if (broadcast_pos.size() == 1) { - mid = out_dims_array[axis]; - post = - std::accumulate(out_dims_array + axis + 1, out_dims_array + max_dim, - 1, std::multiplies()); - } else { - mid = std::accumulate(out_dims_array + axis, - out_dims_array + broadcast_pos.back() + 1, 1, - std::multiplies()); - post = - std::accumulate(out_dims_array + broadcast_pos.back() + 1, - out_dims_array + max_dim, 1, std::multiplies()); - } - - VLOG(3) << "FastBroadCastAllCUDAF pre:" << pre << " mid:" << mid - << " post:" << post; - - int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, mid); - int grid_size = pre * post; - - pten::FastCommonGradBroadcastAllCUDAKernel<<>>( - x_data, y_data, out_data, dout_data, pre, mid, post, is_x_large, dx_op, - dy_op, dx_data, dy_data); - }; - - auto FastBroadCastOneCUDAF = [&](const std::vector &broadcast_pos, - int max_dim, bool is_x) { - int axis = broadcast_pos[0]; - int pre = std::accumulate(out_dims_array, out_dims_array + axis, 1, - std::multiplies()); - int mid = out_dims_array[axis]; - int post = - std::accumulate(out_dims_array + axis + 1, out_dims_array + max_dim, 1, - std::multiplies()); - - int k_pre; - int k_mid; - int k_post; - - if (is_x) { - k_pre = std::accumulate(y_dims_array, y_dims_array + axis, 1, - std::multiplies()); - k_mid = y_dims_array[axis]; - k_post = std::accumulate(y_dims_array + axis + 1, y_dims_array + max_dim, - 1, std::multiplies()); - int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, mid); - int grid_size = pre * post; - // we need to calc y offset with blockid, so do x_pre/y_pre to get left - // size. 
- if (k_pre != pre) k_pre = pre / k_pre; - - pten::FastCommonGradBroadcastOneCUDAKernel<<>>( - x_data, y_data, out_data, dout_data, pre, mid, post, k_pre, k_mid, - k_post, true, dx_op, dx_data); - } else { - k_pre = std::accumulate(x_dims_array, x_dims_array + axis, 1, - std::multiplies()); - k_mid = x_dims_array[axis]; - k_post = std::accumulate(x_dims_array + axis + 1, x_dims_array + max_dim, - 1, std::multiplies()); - int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, mid); - int grid_size = pre * post; - if (k_pre != pre) k_pre = pre / k_pre; - - pten::FastCommonGradBroadcastOneCUDAKernel<<>>( - x_data, y_data, out_data, dout_data, pre, mid, post, k_pre, k_mid, - k_post, false, dy_op, dy_data); - } - VLOG(3) << "FastBroadCastOneCUDAF pre:" << pre << " mid:" << mid - << " post:" << post; - }; - - // do fast elementwise if: 1. only one input need to do broadcast, we can - // fallback - // to old fast path. - // 2. if both x and y need broadcast, then do it one by one. - bool fast_broadcast = false; - if (x_broadcast_pos.empty() && !y_broadcast_pos.empty()) { - can_split_y = pten::SplitDims(y_broadcast_pos, max_dim); - if (can_split_y) { - // only y need to do broadcast on h - if (y_broadcast_pos[0] == 0) { - FastBroadCastHeightCUDAF(y_broadcast_pos, true); - fast_broadcast = true; - } - } else if (y_broadcast_pos.size() == 1 || - pten::CheckContiguousDims( - y_broadcast_pos)) { // for only one dim and - // contiguous broadcast. - // If cannot split, which means input has 3 parts - FastBroadCastAllCUDAF(y_broadcast_pos, max_dim, true); - fast_broadcast = true; - } - } else if (y_broadcast_pos.empty() && !x_broadcast_pos.empty()) { - // only x need broadcast - can_split_x = pten::SplitDims(x_broadcast_pos, max_dim); - if (can_split_x) { - if (x_broadcast_pos[0] == 0) { - FastBroadCastHeightCUDAF(x_broadcast_pos, false); - fast_broadcast = true; - } - } else if (x_broadcast_pos.size() == 1 || - pten::CheckContiguousDims(x_broadcast_pos)) { - FastBroadCastAllCUDAF(x_broadcast_pos, max_dim, false); - fast_broadcast = true; - } - } else if (!x_broadcast_pos.empty() && !y_broadcast_pos.empty()) { - // do x and y broadcast each. - can_split_y = pten::SplitDims(y_broadcast_pos, max_dim); - bool fast_broadcast_x = false; - bool fast_broadcast_y = false; - if (can_split_y) { - // begin at start. - if (y_broadcast_pos[0] == 0) { - FastCommonCUDAF(y_broadcast_pos, true); - fast_broadcast_y = true; - } - } else if (y_broadcast_pos.size() == 1) { - FastBroadCastOneCUDAF(y_broadcast_pos, max_dim, false); - can_split_y = true; - fast_broadcast_y = true; - } - can_split_x = pten::SplitDims(x_broadcast_pos, max_dim); - if (can_split_x) { - if (x_broadcast_pos[0] == 0) { - FastCommonCUDAF(x_broadcast_pos, false); - fast_broadcast_x = true; - } - } else if (x_broadcast_pos.size() == 1) { - FastBroadCastOneCUDAF(x_broadcast_pos, max_dim, true); - can_split_x = true; - fast_broadcast_x = true; - } - VLOG(3) << "CommonBroadcast can_split_y:" << can_split_y - << " can_split_x:" << can_split_x; - // if both x and y into fast path then return - if (fast_broadcast_x && fast_broadcast_y) { - fast_broadcast = true; - } - if (can_split_y && can_split_x && fast_broadcast) return; - } - - // Should remove memory copy, use reg instead. 
- if (fast_broadcast) { - return; - } - int x_blocks = 0; - int x_threads = 0; - pten::ComputeBroadcastKernelSize(x_dims_array, out_dims_array, &x_blocks, - &x_threads, max_dim); - int y_blocks = 0; - int y_threads = 0; - pten::ComputeBroadcastKernelSize(y_dims_array, out_dims_array, &y_blocks, - &y_threads, max_dim); - - auto x_strides_array_tmp = memory::Alloc(ctx, bytes); - int *x_strides_array_gpu = - reinterpret_cast(x_strides_array_tmp->ptr()); - memory::Copy(gplace, x_strides_array_gpu, cplace, x_strides_array.data(), - bytes, ctx.stream()); - - auto y_strides_array_tmp = memory::Alloc(ctx, bytes); - int *y_strides_array_gpu = - reinterpret_cast(y_strides_array_tmp->ptr()); - memory::Copy(gplace, y_strides_array_gpu, cplace, y_strides_array.data(), - bytes, ctx.stream()); - - auto out_dims_array_tmp = memory::Alloc(ctx, bytes); - int *out_dims_array_gpu = reinterpret_cast(out_dims_array_tmp->ptr()); - memory::Copy(gplace, out_dims_array_gpu, cplace, out_dims_array, bytes, - ctx.stream()); - - const int out_size = std::accumulate(out_dims_array, out_dims_array + max_dim, - 1, std::multiplies()); - int x_block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, x_threads); - int y_block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, y_threads); - if (dx) { - auto x_strides_order_tmp = memory::Alloc(ctx, bytes); - int *x_strides_order_gpu = - reinterpret_cast(x_strides_order_tmp->ptr()); - memory::Copy(gplace, x_strides_order_gpu, cplace, x_strides_order.data(), - bytes, ctx.stream()); - - auto x_dims_order_tmp = memory::Alloc(ctx, bytes); - int *x_dims_order_gpu = reinterpret_cast(x_dims_order_tmp->ptr()); - memory::Copy(gplace, x_dims_order_gpu, cplace, x_dims_order.data(), bytes, - ctx.stream()); - CommonGradBroadcastCUDAKernel< - T, DX_OP, Tout><<>>( - x_strides_array_gpu, y_strides_array_gpu, out_dims_array_gpu, - x_strides_order_gpu, x_dims_order_gpu, x_data, y_data, out_data, - dout_data, dx_data, out_size, max_dim, x_threads, dx_op); - } - if (dy) { - auto y_strides_order_tmp = memory::Alloc(ctx, bytes); - int *y_strides_order_gpu = - reinterpret_cast(y_strides_order_tmp->ptr()); - memory::Copy(gplace, y_strides_order_gpu, cplace, y_strides_order.data(), - bytes, ctx.stream()); - - auto y_dims_order_tmp = memory::Alloc(ctx, bytes); - int *y_dims_order_gpu = reinterpret_cast(y_dims_order_tmp->ptr()); - memory::Copy(gplace, y_dims_order_gpu, cplace, y_dims_order.data(), bytes, - ctx.stream()); - CommonGradBroadcastCUDAKernel< - T, DY_OP, Tout><<>>( - x_strides_array_gpu, y_strides_array_gpu, out_dims_array_gpu, - y_strides_order_gpu, y_dims_order_gpu, x_data, y_data, out_data, - dout_data, dy_data, out_size, max_dim, y_threads, dy_op); - } -} - -#endif // __NVCC__ or __HIPCC__ - inline framework::DDim trim_trailing_singular_dims( const framework::DDim &dims) { return pten::funcs::trim_trailing_singular_dims(dims); } -template -void CommonElementwiseBroadcastBackward( - const framework::ExecutionContext &ctx, const framework::DDim &x_dims, - const framework::DDim &y_dims, const framework::Tensor &x, - const framework::Tensor &y, const framework::Tensor &out, - const framework::Tensor &dout, int axis, framework::Tensor *dx, - framework::Tensor *dy, DX_OP dx_op, DY_OP dy_op) { - int max_dim = std::max(x_dims.size(), y_dims.size()); - axis = (axis == -1 ? 
std::abs(x_dims.size() - y_dims.size()) : axis); - std::vector x_dims_array(max_dim); - std::vector y_dims_array(max_dim); - std::vector out_dims_array(max_dim); - GetBroadcastDimsArrays(x_dims, y_dims, x_dims_array.data(), - y_dims_array.data(), out_dims_array.data(), max_dim, - axis); - // for inplace strategy. memset will make dx and dout clear and get wrong - // result. - if (dx && dx->IsSharedBufferWith(dout)) { - dx->clear(); - dx->mutable_data(x_dims, ctx.GetPlace()); - } - - VLOG(3) << "CommonElementwiseBroadcastBackward xdims:" - << framework::make_ddim(x_dims_array) - << " ydim:" << framework::make_ddim(y_dims_array); - - if (platform::is_gpu_place(ctx.GetPlace())) { -#if defined(__NVCC__) || defined(__HIPCC__) - CommonGradBroadcastCUDA( - x, y, out, dout, dx, dy, x_dims_array.data(), y_dims_array.data(), - out_dims_array.data(), max_dim, - ctx.template device_context(), dx_op, - dy_op); -#endif - } else { - pten::CommonGradBroadcastCPU( - x, y, out, dout, dx, dy, x_dims_array.data(), y_dims_array.data(), - out_dims_array.data(), max_dim, - ctx.template device_context(), dx_op, - dy_op); - } -} - -template -void ElemwiseGradComputeWithBroadcast( - const framework::ExecutionContext &ctx, const framework::DDim &x_dims, - const framework::DDim &y_dims, const framework::Tensor &x, - const framework::Tensor &y, const framework::Tensor &out, - const framework::Tensor &dout, int axis, framework::Tensor *dx, - framework::Tensor *dy, DX_OP dx_op, DY_OP dy_op) { - bool is_xsize_larger = true; - - int max_dim = x_dims.size(); - if (x_dims.size() < y_dims.size()) { - is_xsize_larger = false; - max_dim = y_dims.size(); - } - - axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); - PADDLE_ENFORCE_GE( - axis, 0, - platform::errors::InvalidArgument( - "Axis should be great than or equal to 0, but received axis is %d.", - axis)); - PADDLE_ENFORCE_LT(axis, max_dim, - platform::errors::InvalidArgument( - "Axis should be less than %d, but received axis is %d.", - max_dim, axis)); - - int pre, n, post, is_run_common_broadcast, axis_trim = 0; - if (is_xsize_larger) { - auto y_dims_trimed = trim_trailing_singular_dims(y_dims); - axis_trim = (y_dims_trimed.size() == 0) ? x_dims.size() : axis; - pten::funcs::get_mid_dims(x_dims, y_dims_trimed, axis_trim, &pre, &n, &post, - &is_run_common_broadcast); - } else { - auto x_dims_trimed = trim_trailing_singular_dims(x_dims); - axis_trim = (x_dims_trimed.size() == 0) ? y_dims.size() : axis; - pten::funcs::get_mid_dims(y_dims, x_dims_trimed, axis_trim, &pre, &n, &post, - &is_run_common_broadcast); - } - // special case for common backward implementation. - if (is_run_common_broadcast) { - CommonElementwiseBroadcastBackward( - ctx, x_dims, y_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op); - return; - } - if (post == 1) { - if (platform::is_gpu_place(ctx.GetPlace())) { -#if defined(__NVCC__) || defined(__HIPCC__) - pten::ElemwiseGradBroadcast1CUDA( - ctx.template device_context().stream(), x.data(), - y.data(), out.data(), dout.data(), pre, n, - is_xsize_larger, dx_op, dy_op, - dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), - dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())); -#endif - } else { - pten::ElemwiseGradBroadcast1CPU( - x.data(), y.data(), out.data(), dout.data(), pre, n, - is_xsize_larger, dx_op, dy_op, - dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), - dy == nullptr ? 
nullptr : dy->mutable_data(ctx.GetPlace())); - } - } else { - if (platform::is_gpu_place(ctx.GetPlace())) { -#if defined(__NVCC__) || defined(__HIPCC__) - pten::ElemwiseGradBroadcast2CUDA( - ctx.template device_context().stream(), x.data(), - y.data(), out.data(), dout.data(), pre, n, post, - is_xsize_larger, dx_op, dy_op, - dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), - dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())); -#endif - } else { - pten::ElemwiseGradBroadcast2CPU( - x.data(), y.data(), out.data(), dout.data(), pre, n, - post, is_xsize_larger, dx_op, dy_op, - dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), - dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())); - } - } -} - -template -void CommonElementwiseBroadcastForward( - const framework::ExecutionContext &ctx, const framework::Tensor *x, - const framework::Tensor *y, framework::Tensor *z, - const framework::DDim &x_dims, const framework::DDim &y_dims, Functor func, - int axis, const bool is_xsize_larger = true) { - z->mutable_data(ctx.GetPlace()); - auto pt_x = paddle::experimental::MakePtenDenseTensor(*x); - auto pt_y = paddle::experimental::MakePtenDenseTensor(*y); - auto pt_z = paddle::experimental::MakePtenDenseTensor(*z); - const auto &dev_ctx = ctx.template device_context(); - pten::CommonElementwiseBroadcastForward(dev_ctx, *pt_x.get(), *pt_y.get(), - pt_z.get(), x_dims, y_dims, func, - axis, is_xsize_larger); -} - template void ElemwiseGradCompute(const framework::ExecutionContext &ctx, @@ -755,14 +147,14 @@ void ElemwiseGradCompute(const framework::ExecutionContext &ctx, DX_OP dx_op, DY_OP dy_op) { const framework::DDim &x_dim = x.dims(); const framework::DDim &y_dim = y.dims(); + const auto &dev_ctx = ctx.template device_context(); if (x.dims() == y.dims()) { - const auto &dev_ctx = ctx.template device_context(); pten::funcs::ElemwiseGradComputeNoBroadcast( dev_ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op); } else { - ElemwiseGradComputeWithBroadcast( - ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op); + pten::ElemwiseGradComputeWithBroadcast( + dev_ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op); } } @@ -780,14 +172,15 @@ void ElemwiseExplicitGradCompute(const framework::ExecutionContext &ctx, DX_OP dx_op, DY_OP dy_op) { const framework::DDim &x_dim = x.dims(); const framework::DDim &y_dim = y.dims(); + const auto &dev_ctx = ctx.template device_context(); if (x.dims() == y.dims()) { - const auto &dev_ctx = ctx.template device_context(); pten::funcs::ElemwiseGradComputeNoBroadcast( dev_ctx, x_dim, y_dim, dout, dout, out, dout, axis, dx, dy, dx_op, dy_op); } else { - ElemwiseGradComputeWithBroadcast( - ctx, x_dim, y_dim, dout, dout, out, dout, axis, dx, dy, dx_op, dy_op); + pten::ElemwiseGradComputeWithBroadcast( + dev_ctx, x_dim, y_dim, dout, dout, out, dout, axis, dx, dy, dx_op, + dy_op); } } diff --git a/paddle/pten/kernels/cpu/elementwise.h b/paddle/pten/kernels/cpu/elementwise.h index 97db997a16478..b448586754d60 100644 --- a/paddle/pten/kernels/cpu/elementwise.h +++ b/paddle/pten/kernels/cpu/elementwise.h @@ -549,4 +549,148 @@ static void ElemwiseGradBroadcast2CPU(const T* x, } } +template +void CommonElementwiseBroadcastBackward(const CPUContext& ctx, + const DDim& x_dims, + const DDim& y_dims, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out, + const DenseTensor& dout, + int axis, + DenseTensor* dx, + DenseTensor* dy, + DX_OP dx_op, + DY_OP dy_op) { + int max_dim = std::max(x_dims.size(), 
y_dims.size()); + axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); + std::vector x_dims_array(max_dim); + std::vector y_dims_array(max_dim); + std::vector out_dims_array(max_dim); + funcs::GetBroadcastDimsArrays(x_dims, + y_dims, + x_dims_array.data(), + y_dims_array.data(), + out_dims_array.data(), + max_dim, + axis); + // for inplace strategy. memset will make dx and dout clear and get wrong + // result. + if (dx && dx->IsSharedBufferWith(dout)) { + dx->clear(); + dx->mutable_data(x_dims, ctx.GetPlace()); + } + + VLOG(3) << "CommonElementwiseBroadcastBackward xdims:" + << paddle::framework::make_ddim(x_dims_array) + << " ydim:" << paddle::framework::make_ddim(y_dims_array); + + CommonGradBroadcastCPU(x, + y, + out, + dout, + dx, + dy, + x_dims_array.data(), + y_dims_array.data(), + out_dims_array.data(), + max_dim, + ctx, + dx_op, + dy_op); +} + +template +void ElemwiseGradComputeWithBroadcast(const CPUContext& ctx, + const DDim& x_dims, + const DDim& y_dims, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out, + const DenseTensor& dout, + int axis, + DenseTensor* dx, + DenseTensor* dy, + DX_OP dx_op, + DY_OP dy_op) { + bool is_xsize_larger = true; + + int max_dim = x_dims.size(); + if (x_dims.size() < y_dims.size()) { + is_xsize_larger = false; + max_dim = y_dims.size(); + } + + axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); + PADDLE_ENFORCE_GE( + axis, + 0, + paddle::platform::errors::InvalidArgument( + "Axis should be great than or equal to 0, but received axis is %d.", + axis)); + PADDLE_ENFORCE_LT(axis, + max_dim, + paddle::platform::errors::InvalidArgument( + "Axis should be less than %d, but received axis is %d.", + max_dim, + axis)); + + int pre, n, post, is_run_common_broadcast, axis_trim = 0; + if (is_xsize_larger) { + auto y_dims_trimed = funcs::trim_trailing_singular_dims(y_dims); + axis_trim = (y_dims_trimed.size() == 0) ? x_dims.size() : axis; + funcs::get_mid_dims(x_dims, + y_dims_trimed, + axis_trim, + &pre, + &n, + &post, + &is_run_common_broadcast); + } else { + auto x_dims_trimed = funcs::trim_trailing_singular_dims(x_dims); + axis_trim = (x_dims_trimed.size() == 0) ? y_dims.size() : axis; + funcs::get_mid_dims(y_dims, + x_dims_trimed, + axis_trim, + &pre, + &n, + &post, + &is_run_common_broadcast); + } + // special case for common backward implementation. + if (is_run_common_broadcast) { + CommonElementwiseBroadcastBackward( + ctx, x_dims, y_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op); + return; + } + if (post == 1) { + ElemwiseGradBroadcast1CPU( + x.data(), + y.data(), + out.data(), + dout.data(), + pre, + n, + is_xsize_larger, + dx_op, + dy_op, + dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), + dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())); + } else { + ElemwiseGradBroadcast2CPU( + x.data(), + y.data(), + out.data(), + dout.data(), + pre, + n, + post, + is_xsize_larger, + dx_op, + dy_op, + dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), + dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())); + } +} + } // namespace pten diff --git a/paddle/pten/kernels/gpu/elementwise.h b/paddle/pten/kernels/gpu/elementwise.h index 4dfcd7a2152e0..5abc40c75d17f 100644 --- a/paddle/pten/kernels/gpu/elementwise.h +++ b/paddle/pten/kernels/gpu/elementwise.h @@ -18,7 +18,10 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/aligned_vector.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/function_traits.h" +#include "paddle/pten/backends/gpu/gpu_context.h" #include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/kernels/funcs/cuda_kernel_config.h" +#include "paddle/pten/kernels/funcs/elementwise_base.h" #ifdef __HIPCC__ constexpr int ELEMWISE_MAX_BLOCK_DIM = 256; @@ -28,6 +31,13 @@ constexpr int ELEMWISE_MAX_BLOCK_DIM = 1024; #define BLOCK_X 32 #define BLOCK_Y 32 +#define GetDivMod(dividend, divisor, div, mod) \ + do { \ + const auto dividend_copy = dividend; \ + *div = dividend_copy / divisor; \ + *mod = dividend_copy % divisor; \ + } while (0) + namespace pten { namespace kps = paddle::operators::kernel_primitives; @@ -1469,4 +1479,762 @@ static void ElemwiseGradBroadcast2CUDA(gpuStream_t stream, x, y, out, dout, pre, n, post, is_xsize_larger, dx_op, dy_op, dx, dy); } +template +__global__ void CommonGradBroadcastCUDAKernel(const int *x_strides_array, + const int *y_strides_array, + const int *out_dims_array, + const int *y_strides_order, + const int *y_dims_order, + const T *x, + const T *y, + const Tout *out, + const Tout *dout, + T *dx, + int out_size, + int max_dim, + int thread_num, + DX_OP dx_op) { + T val(0); + int i = blockIdx.x; + int tid = threadIdx.x; + for (int j = tid; j < thread_num; j += blockDim.x) { + const int X_index = i * thread_num + j; + int out_index = X_index; + int C_index = 0; + int B_index = i * thread_num + j; + int remainder = 0; +#pragma unroll + for (int d = max_dim - 1; d >= 0; --d) { + GetDivMod(B_index, y_dims_order[d], &B_index, &remainder); + C_index += remainder * y_strides_order[d]; + } + int x_index = 0; + int y_index = 0; + int C_index_val = C_index; +#pragma unroll + for (int d = max_dim - 1; d >= 0; --d) { + GetDivMod(C_index_val, out_dims_array[d], &C_index_val, &remainder); + x_index += remainder * x_strides_array[d]; + y_index += remainder * y_strides_array[d]; + } + out_index = C_index; + val += dx_op(x[x_index], y[y_index], out[out_index], dout[out_index]); + } + val = paddle::platform::reduceSum(val, tid, thread_num); + if (threadIdx.x == 0) { + dx[i] = val; + } +} + +template +void CommonGradBroadcastCUDA(const DenseTensor &x, + const DenseTensor &y, + const DenseTensor &out, + const DenseTensor &dout, + DenseTensor *dx, + DenseTensor *dy, + int *x_dims_array, + int *y_dims_array, + int *out_dims_array, + int max_dim, + const GPUContext &ctx, + DX_OP dx_op, + DY_OP dy_op) { + const auto gplace = + BOOST_GET_CONST(paddle::platform::CUDAPlace, ctx.GetPlace()); + auto cplace = paddle::platform::CPUPlace(); + const T *x_data = x.data(); + const T *y_data = y.data(); + const Tout *out_data = out.data(); + const Tout *dout_data = dout.data(); + T *dx_data = dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()); + T *dy_data = dy == nullptr ? 
nullptr : dy->mutable_data(ctx.GetPlace()); + + std::vector x_one_indexs; + std::vector y_one_indexs; + for (int i = 0; i < max_dim; i++) { + if (x_dims_array[i] != y_dims_array[i]) { + if (x_dims_array[i] == 1) { + x_one_indexs.push_back(i); + } + if (y_dims_array[i] == 1) { + y_one_indexs.push_back(i); + } + } + } + + std::vector x_trans_indexs(max_dim); + std::vector y_trans_indexs(max_dim); + ComputeBroadcastTranspositionArray( + x_one_indexs.data(), x_trans_indexs.data(), max_dim, x_one_indexs.size()); + ComputeBroadcastTranspositionArray( + y_one_indexs.data(), y_trans_indexs.data(), max_dim, y_one_indexs.size()); + + // compute array stride for cuda kernel; + // e.g. x.dims=[2,3,4], x_stride=[12,4,1] + std::vector x_strides_array(max_dim); + std::vector y_strides_array(max_dim); + std::vector out_strides_array(max_dim); + int x_stride = 1; + int y_stride = 1; + int z_stride = 1; + for (int i = max_dim - 1; i >= 0; i--) { + x_strides_array[i] = x_dims_array[i] == 1 ? 0 : x_stride; + y_strides_array[i] = y_dims_array[i] == 1 ? 0 : y_stride; + out_strides_array[i] = z_stride; + x_stride *= x_dims_array[i]; + y_stride *= y_dims_array[i]; + z_stride *= out_dims_array[i]; + } + + std::vector x_strides_order(max_dim); + std::vector y_strides_order(max_dim); + std::vector x_dims_order(max_dim); + std::vector y_dims_order(max_dim); + for (int i = 0; i < max_dim; ++i) { + x_strides_order[i] = out_strides_array[x_trans_indexs[i]]; + y_strides_order[i] = out_strides_array[y_trans_indexs[i]]; + x_dims_order[i] = out_dims_array[x_trans_indexs[i]]; + y_dims_order[i] = out_dims_array[y_trans_indexs[i]]; + } + std::vector x_broadcast_pos; + std::vector y_broadcast_pos; + + int bytes = max_dim * sizeof(int); + + for (int i = 0; i < max_dim; ++i) { + if (x_dims_array[i] != out_dims_array[i] && x_dims_array[i] == 1) { + x_broadcast_pos.emplace_back(i); + } + if (y_dims_array[i] != out_dims_array[i] && y_dims_array[i] == 1) { + y_broadcast_pos.emplace_back(i); + } + } + + auto stream = ctx.stream(); + bool can_split_x = false; + bool can_split_y = false; + + auto FastCommonCUDAF = [&](const std::vector &broadcast_pos, bool is_y) { + int h = std::accumulate(out_dims_array, + out_dims_array + broadcast_pos.size(), + 1, + std::multiplies()); + int w = std::accumulate(out_dims_array + broadcast_pos.size(), + out_dims_array + max_dim, + 1, + std::multiplies()); + + VLOG(3) << "FastCommonCUDAF elementwise w:" << w << " h:" << h + << " is_y:" << is_y; + + int split_h; + int split_w; + int kh = h; + int kw = w; + + if (is_y) { + split_h = std::accumulate(x_dims_array, + x_dims_array + broadcast_pos.size(), + 1, + std::multiplies()); + split_w = std::accumulate(x_dims_array + broadcast_pos.size(), + x_dims_array + max_dim, + 1, + std::multiplies()); + + } else { + split_h = std::accumulate(y_dims_array, + y_dims_array + broadcast_pos.size(), + 1, + std::multiplies()); + split_w = std::accumulate(y_dims_array + broadcast_pos.size(), + y_dims_array + max_dim, + 1, + std::multiplies()); + } + + if (h > split_h) kh = split_h; + if (w > split_w) kw = split_w; + + if (is_y) { + if (w < 16 || h < 16) { + int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, h); + int grid_size = w; + CommonGradBroadcast1CUDAKernelHeight<<>>(x_data, + y_data, + out_data, + dout_data, + h, + w, + dy_op, + dy_data, + kh, + kw, + is_y); + } else { + dim3 block_size = dim3(BLOCK_X, BLOCK_Y); + int grid_size = (w + BLOCK_X - 1) / BLOCK_X; + FastCommonGradBroadcastCUDAKernelHeight<<>>(x_data, + y_data, + out_data, + dout_data, + h, + w, + dy_op, 
+ dy_data, + kh, + kw, + is_y); + } + } else { + if (w < 16 || h < 16) { + int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, h); + int grid_size = w; + CommonGradBroadcast1CUDAKernelHeight<<>>(x_data, + y_data, + out_data, + dout_data, + h, + w, + dx_op, + dx_data, + kh, + kw, + is_y); + } else { + dim3 block_size = dim3(BLOCK_X, BLOCK_Y); + int grid_size = (w + BLOCK_X - 1) / BLOCK_X; + FastCommonGradBroadcastCUDAKernelHeight<<>>(x_data, + y_data, + out_data, + dout_data, + h, + w, + dx_op, + dx_data, + kh, + kw, + is_y); + } + } + }; + + auto FastBroadCastHeightCUDAF = [&](const std::vector &broadcast_pos, + bool x_large) { + int h = std::accumulate(out_dims_array, + out_dims_array + broadcast_pos.size(), + 1, + std::multiplies()); + int w = std::accumulate(out_dims_array + broadcast_pos.size(), + out_dims_array + max_dim, + 1, + std::multiplies()); + + VLOG(3) << "FastBroadCastHeightCUDAF w:" << w << " h:" << h; + + if (w < 16 || h < 16) { + int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, h); + int grid_size = w; + ElemwiseGradBroadcast1CUDAKernel<<>>( + x_data, + y_data, + out_data, + dout_data, + h, + w, + x_large, + dx_op, + dy_op, + dx_data, + dy_data); + } else { + dim3 block_size = dim3(BLOCK_X, BLOCK_Y); + int grid_size = (w + BLOCK_X - 1) / BLOCK_X; + FastElemwiseGradBroadcast1CUDAKernel<<>>(x_data, + y_data, + out_data, + dout_data, + h, + w, + x_large, + dx_op, + dy_op, + dx_data, + dy_data); + } + }; + + auto FastBroadCastAllCUDAF = [&]( + const std::vector &broadcast_pos, int max_dim, bool is_x_large) { + int axis = broadcast_pos[0]; + int pre = std::accumulate( + out_dims_array, out_dims_array + axis, 1, std::multiplies()); + int mid = 1; + int post = 1; + + if (broadcast_pos.size() == 1) { + mid = out_dims_array[axis]; + post = std::accumulate(out_dims_array + axis + 1, + out_dims_array + max_dim, + 1, + std::multiplies()); + } else { + mid = std::accumulate(out_dims_array + axis, + out_dims_array + broadcast_pos.back() + 1, + 1, + std::multiplies()); + post = std::accumulate(out_dims_array + broadcast_pos.back() + 1, + out_dims_array + max_dim, + 1, + std::multiplies()); + } + + VLOG(3) << "FastBroadCastAllCUDAF pre:" << pre << " mid:" << mid + << " post:" << post; + + int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, mid); + int grid_size = pre * post; + + FastCommonGradBroadcastAllCUDAKernel<<>>( + x_data, + y_data, + out_data, + dout_data, + pre, + mid, + post, + is_x_large, + dx_op, + dy_op, + dx_data, + dy_data); + }; + + auto FastBroadCastOneCUDAF = [&]( + const std::vector &broadcast_pos, int max_dim, bool is_x) { + int axis = broadcast_pos[0]; + int pre = std::accumulate( + out_dims_array, out_dims_array + axis, 1, std::multiplies()); + int mid = out_dims_array[axis]; + int post = std::accumulate(out_dims_array + axis + 1, + out_dims_array + max_dim, + 1, + std::multiplies()); + + int k_pre; + int k_mid; + int k_post; + + if (is_x) { + k_pre = std::accumulate( + y_dims_array, y_dims_array + axis, 1, std::multiplies()); + k_mid = y_dims_array[axis]; + k_post = std::accumulate(y_dims_array + axis + 1, + y_dims_array + max_dim, + 1, + std::multiplies()); + int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, mid); + int grid_size = pre * post; + // we need to calc y offset with blockid, so do x_pre/y_pre to get left + // size. 
+ if (k_pre != pre) k_pre = pre / k_pre; + + FastCommonGradBroadcastOneCUDAKernel<<>>(x_data, + y_data, + out_data, + dout_data, + pre, + mid, + post, + k_pre, + k_mid, + k_post, + true, + dx_op, + dx_data); + } else { + k_pre = std::accumulate( + x_dims_array, x_dims_array + axis, 1, std::multiplies()); + k_mid = x_dims_array[axis]; + k_post = std::accumulate(x_dims_array + axis + 1, + x_dims_array + max_dim, + 1, + std::multiplies()); + int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, mid); + int grid_size = pre * post; + if (k_pre != pre) k_pre = pre / k_pre; + + FastCommonGradBroadcastOneCUDAKernel<<>>(x_data, + y_data, + out_data, + dout_data, + pre, + mid, + post, + k_pre, + k_mid, + k_post, + false, + dy_op, + dy_data); + } + VLOG(3) << "FastBroadCastOneCUDAF pre:" << pre << " mid:" << mid + << " post:" << post; + }; + + // do fast elementwise if: 1. only one input need to do broadcast, we can + // fallback + // to old fast path. + // 2. if both x and y need broadcast, then do it one by one. + bool fast_broadcast = false; + if (x_broadcast_pos.empty() && !y_broadcast_pos.empty()) { + can_split_y = SplitDims(y_broadcast_pos, max_dim); + if (can_split_y) { + // only y need to do broadcast on h + if (y_broadcast_pos[0] == 0) { + FastBroadCastHeightCUDAF(y_broadcast_pos, true); + fast_broadcast = true; + } + } else if (y_broadcast_pos.size() == 1 || + CheckContiguousDims(y_broadcast_pos)) { // for only one dim and + // contiguous broadcast. + // If cannot split, which means input has 3 parts + FastBroadCastAllCUDAF(y_broadcast_pos, max_dim, true); + fast_broadcast = true; + } + } else if (y_broadcast_pos.empty() && !x_broadcast_pos.empty()) { + // only x need broadcast + can_split_x = SplitDims(x_broadcast_pos, max_dim); + if (can_split_x) { + if (x_broadcast_pos[0] == 0) { + FastBroadCastHeightCUDAF(x_broadcast_pos, false); + fast_broadcast = true; + } + } else if (x_broadcast_pos.size() == 1 || + CheckContiguousDims(x_broadcast_pos)) { + FastBroadCastAllCUDAF(x_broadcast_pos, max_dim, false); + fast_broadcast = true; + } + } else if (!x_broadcast_pos.empty() && !y_broadcast_pos.empty()) { + // do x and y broadcast each. + can_split_y = SplitDims(y_broadcast_pos, max_dim); + bool fast_broadcast_x = false; + bool fast_broadcast_y = false; + if (can_split_y) { + // begin at start. + if (y_broadcast_pos[0] == 0) { + FastCommonCUDAF(y_broadcast_pos, true); + fast_broadcast_y = true; + } + } else if (y_broadcast_pos.size() == 1) { + FastBroadCastOneCUDAF(y_broadcast_pos, max_dim, false); + can_split_y = true; + fast_broadcast_y = true; + } + can_split_x = SplitDims(x_broadcast_pos, max_dim); + if (can_split_x) { + if (x_broadcast_pos[0] == 0) { + FastCommonCUDAF(x_broadcast_pos, false); + fast_broadcast_x = true; + } + } else if (x_broadcast_pos.size() == 1) { + FastBroadCastOneCUDAF(x_broadcast_pos, max_dim, true); + can_split_x = true; + fast_broadcast_x = true; + } + VLOG(3) << "CommonBroadcast can_split_y:" << can_split_y + << " can_split_x:" << can_split_x; + // if both x and y into fast path then return + if (fast_broadcast_x && fast_broadcast_y) { + fast_broadcast = true; + } + if (can_split_y && can_split_x && fast_broadcast) return; + } + + // Should remove memory copy, use reg instead. 
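+  // Only the generic stride-based kernels below run when no fast path above handled the broadcast.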
+ if (fast_broadcast) { + return; + } + int x_blocks = 0; + int x_threads = 0; + ComputeBroadcastKernelSize( + x_dims_array, out_dims_array, &x_blocks, &x_threads, max_dim); + int y_blocks = 0; + int y_threads = 0; + ComputeBroadcastKernelSize( + y_dims_array, out_dims_array, &y_blocks, &y_threads, max_dim); + + auto x_strides_array_tmp = paddle::memory::Alloc(ctx, bytes); + int *x_strides_array_gpu = + reinterpret_cast(x_strides_array_tmp->ptr()); + paddle::memory::Copy(gplace, + x_strides_array_gpu, + cplace, + x_strides_array.data(), + bytes, + ctx.stream()); + + auto y_strides_array_tmp = paddle::memory::Alloc(ctx, bytes); + int *y_strides_array_gpu = + reinterpret_cast(y_strides_array_tmp->ptr()); + paddle::memory::Copy(gplace, + y_strides_array_gpu, + cplace, + y_strides_array.data(), + bytes, + ctx.stream()); + + auto out_dims_array_tmp = paddle::memory::Alloc(ctx, bytes); + int *out_dims_array_gpu = reinterpret_cast(out_dims_array_tmp->ptr()); + paddle::memory::Copy( + gplace, out_dims_array_gpu, cplace, out_dims_array, bytes, ctx.stream()); + + const int out_size = std::accumulate( + out_dims_array, out_dims_array + max_dim, 1, std::multiplies()); + int x_block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, x_threads); + int y_block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, y_threads); + if (dx) { + auto x_strides_order_tmp = paddle::memory::Alloc(ctx, bytes); + int *x_strides_order_gpu = + reinterpret_cast(x_strides_order_tmp->ptr()); + paddle::memory::Copy(gplace, + x_strides_order_gpu, + cplace, + x_strides_order.data(), + bytes, + ctx.stream()); + + auto x_dims_order_tmp = paddle::memory::Alloc(ctx, bytes); + int *x_dims_order_gpu = reinterpret_cast(x_dims_order_tmp->ptr()); + paddle::memory::Copy(gplace, + x_dims_order_gpu, + cplace, + x_dims_order.data(), + bytes, + ctx.stream()); + CommonGradBroadcastCUDAKernel< + T, + DX_OP, + Tout><<>>(x_strides_array_gpu, + y_strides_array_gpu, + out_dims_array_gpu, + x_strides_order_gpu, + x_dims_order_gpu, + x_data, + y_data, + out_data, + dout_data, + dx_data, + out_size, + max_dim, + x_threads, + dx_op); + } + if (dy) { + auto y_strides_order_tmp = paddle::memory::Alloc(ctx, bytes); + int *y_strides_order_gpu = + reinterpret_cast(y_strides_order_tmp->ptr()); + paddle::memory::Copy(gplace, + y_strides_order_gpu, + cplace, + y_strides_order.data(), + bytes, + ctx.stream()); + + auto y_dims_order_tmp = paddle::memory::Alloc(ctx, bytes); + int *y_dims_order_gpu = reinterpret_cast(y_dims_order_tmp->ptr()); + paddle::memory::Copy(gplace, + y_dims_order_gpu, + cplace, + y_dims_order.data(), + bytes, + ctx.stream()); + CommonGradBroadcastCUDAKernel< + T, + DY_OP, + Tout><<>>(x_strides_array_gpu, + y_strides_array_gpu, + out_dims_array_gpu, + y_strides_order_gpu, + y_dims_order_gpu, + x_data, + y_data, + out_data, + dout_data, + dy_data, + out_size, + max_dim, + y_threads, + dy_op); + } +} + +template +void CommonElementwiseBroadcastBackward(const GPUContext &ctx, + const DDim &x_dims, + const DDim &y_dims, + const DenseTensor &x, + const DenseTensor &y, + const DenseTensor &out, + const DenseTensor &dout, + int axis, + DenseTensor *dx, + DenseTensor *dy, + DX_OP dx_op, + DY_OP dy_op) { + int max_dim = std::max(x_dims.size(), y_dims.size()); + axis = (axis == -1 ? 
std::abs(x_dims.size() - y_dims.size()) : axis); + std::vector x_dims_array(max_dim); + std::vector y_dims_array(max_dim); + std::vector out_dims_array(max_dim); + funcs::GetBroadcastDimsArrays(x_dims, + y_dims, + x_dims_array.data(), + y_dims_array.data(), + out_dims_array.data(), + max_dim, + axis); + // for inplace strategy. memset will make dx and dout clear and get wrong + // result. + if (dx && dx->IsSharedBufferWith(dout)) { + dx->clear(); + dx->mutable_data(x_dims, ctx.GetPlace()); + } + + VLOG(3) << "CommonElementwiseBroadcastBackward xdims:" + << paddle::framework::make_ddim(x_dims_array) + << " ydim:" << paddle::framework::make_ddim(y_dims_array); + + CommonGradBroadcastCUDA(x, + y, + out, + dout, + dx, + dy, + x_dims_array.data(), + y_dims_array.data(), + out_dims_array.data(), + max_dim, + ctx, + dx_op, + dy_op); +} + +template +void ElemwiseGradComputeWithBroadcast(const GPUContext &ctx, + const DDim &x_dims, + const DDim &y_dims, + const DenseTensor &x, + const DenseTensor &y, + const DenseTensor &out, + const DenseTensor &dout, + int axis, + DenseTensor *dx, + DenseTensor *dy, + DX_OP dx_op, + DY_OP dy_op) { + bool is_xsize_larger = true; + + int max_dim = x_dims.size(); + if (x_dims.size() < y_dims.size()) { + is_xsize_larger = false; + max_dim = y_dims.size(); + } + + axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); + PADDLE_ENFORCE_GE( + axis, + 0, + paddle::platform::errors::InvalidArgument( + "Axis should be great than or equal to 0, but received axis is %d.", + axis)); + PADDLE_ENFORCE_LT(axis, + max_dim, + paddle::platform::errors::InvalidArgument( + "Axis should be less than %d, but received axis is %d.", + max_dim, + axis)); + + int pre, n, post, is_run_common_broadcast, axis_trim = 0; + if (is_xsize_larger) { + auto y_dims_trimed = funcs::trim_trailing_singular_dims(y_dims); + axis_trim = (y_dims_trimed.size() == 0) ? x_dims.size() : axis; + funcs::get_mid_dims(x_dims, + y_dims_trimed, + axis_trim, + &pre, + &n, + &post, + &is_run_common_broadcast); + } else { + auto x_dims_trimed = funcs::trim_trailing_singular_dims(x_dims); + axis_trim = (x_dims_trimed.size() == 0) ? y_dims.size() : axis; + funcs::get_mid_dims(y_dims, + x_dims_trimed, + axis_trim, + &pre, + &n, + &post, + &is_run_common_broadcast); + } + // special case for common backward implementation. + if (is_run_common_broadcast) { + CommonElementwiseBroadcastBackward( + ctx, x_dims, y_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op); + return; + } + if (post == 1) { + ElemwiseGradBroadcast1CUDA( + ctx.stream(), + x.data(), + y.data(), + out.data(), + dout.data(), + pre, + n, + is_xsize_larger, + dx_op, + dy_op, + dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), + dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())); + } else { + ElemwiseGradBroadcast2CUDA( + ctx.stream(), + x.data(), + y.data(), + out.data(), + dout.data(), + pre, + n, + post, + is_xsize_larger, + dx_op, + dy_op, + dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), + dy == nullptr ? 
nullptr : dy->mutable_data(ctx.GetPlace())); + } +} + } // namespace pten From 4c77a9086c488a9a0b11d4e7f0c406c31716345e Mon Sep 17 00:00:00 2001 From: Baibaifan <39549453+Baibaifan@users.noreply.github.com> Date: Fri, 14 Jan 2022 15:38:49 +0800 Subject: [PATCH 20/24] Add dygraph sharding stage3 (#38052) --- paddle/pten/core/dense_tensor.cc | 4 + .../meta_parallel/sharding/sharding_stage3.py | 675 ++++++++++++++++++ .../meta_parallel/sharding/sharding_utils.py | 31 +- .../fluid/tests/unittests/CMakeLists.txt | 3 + .../unittests/dygraph_sharding_stage3.py | 233 ++++++ .../unittests/test_dygraph_sharding_stage3.py | 31 + 6 files changed, 960 insertions(+), 17 deletions(-) create mode 100644 python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py create mode 100644 python/paddle/fluid/tests/unittests/dygraph_sharding_stage3.py create mode 100644 python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage3.py diff --git a/paddle/pten/core/dense_tensor.cc b/paddle/pten/core/dense_tensor.cc index 0b5f5cb18e13d..eb6f834d72779 100644 --- a/paddle/pten/core/dense_tensor.cc +++ b/paddle/pten/core/dense_tensor.cc @@ -435,6 +435,10 @@ inline T* DenseTensor::mutable_data(const paddle::platform::Place& place, } void DenseTensor::ShareBufferWith(const DenseTensor& tensor) { + if (storage_ == nullptr) { + storage_ = make_intrusive( + paddle::platform::CPUPlace()); + } if (storage_ != nullptr && tensor.storage_ != nullptr) { storage_->set_data_shared(tensor.storage_->data_shared()); } diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py new file mode 100644 index 0000000000000..e5d04aac1551e --- /dev/null +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py @@ -0,0 +1,675 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import copy +import time +import contextlib +import logging +import functools +import numpy as np +from itertools import chain +from functools import reduce +from types import MethodType +from collections import deque, OrderedDict + +import paddle +from paddle import nn +from paddle.autograd import PyLayer +import paddle.fluid.core as core +import paddle.distributed as dist +from paddle.fluid.framework import ParamBase +from paddle.fluid.clip import ClipGradByGlobalNorm +from paddle.distributed.collective import _get_global_group + +from .sharding_utils import Type, ShardingClipGrad +from ..pp_utils.utils import _all_gather + +# CUDA alignment 256 bytes +alignment = {"gpu": 256, } +align = { + Type.fp16.value: 2, + Type.fp32.value: 4, +} + +global CHECK_LAYER +CHECK_LAYER = dict() # Help to check layer's id -> layer's name + + +class ShardingStage3(nn.Layer): + """ + A wrapper for Sharding Stage3 Layer in Dygraph. + + .. warning: ShardingStage3 encapsulates the layer strategy and integrates it into the nn.Layer. + + .. 
ZeRO: https://arxiv.org/pdf/1910.02054.pdf. + """ + + def __init__(self, + layer, + optimizer, + group=None, + sync_buffers=False, + device="gpu", + pertrain_sync_models=True, + accumulate_grads=False, + offload=False, + sync_comm=False): + super().__init__() + + # Default configs + assert core.is_compiled_with_cuda(), "Only support CUDA." + self._layer = layer + self._default_device = device + self.__sync_buffers = sync_buffers + self._accumulate_grads = accumulate_grads + self._offload = offload + self._sync_comm = sync_comm + + # Communication group establishment + self._group = dist.new_group(_get_global_group() + .ranks) if group is None else group + self._world_size_scaling = 1.0 / self._group.nranks + assert self._group.nranks > 1, "Training must be distributed, ranks must be greater than 1." + self._rank = self._group.rank + self._global_root_rank = 0 # picking rank 0 as the reference + self._global_ranks = self._group.ranks + self._param2buffer_size = dict() # {param.name: size} + self._param2buffer = dict( + ) # {param.name: [(start0, end0),(start1, end1), ...]} + self._trainable_params = dict() # {layer.name: [trainable_params]} + + assert not isinstance( + optimizer, list), "Multiple optimizers are not supported now." + self._optim = _OptimizerWrapper(optimizer, self._offload, self._group, + self._update_params_slice) + self._ori_parameter_list = self._optim._parameter_list + self._ori_param_groups = self._optim._param_groups + + # Replace optimizer's _grad_clip + if isinstance(self._optim._grad_clip, ClipGradByGlobalNorm): + logging.warning( + "While using ClipGradByGlobalNorm in ShardingStage3, the grad clip of original optimizer will be changed." + ) + self._optim._grad_clip = ShardingClipGrad(self._optim._grad_clip, + paddle.get_device(), + self._group) + + # Synchronous all ranks models + if pertrain_sync_models: + self._sync_params_and_buffers() + + self._segment_rank_params(self._layer) + + # In the first step, record the execution order of the layer + self._order_tracer = OrderedDict() + self._order_tracer["order"] = 0 + self._order_tracer["layer"] = [] + # Register task flow + self._task_flow = TaskFlow() + # Register forward hooks + self._register_forward_hooks(self._layer) + # Register backward parameter hooks + self._register_backward_hooks() + # Redefine optimizer step and clear function + self._redefine_opt_step() + self._redefine_opt_clear() + + @paddle.no_grad() + def _sync_params_and_buffers(self): + """ + Sync all model states for all ranks + """ + + for p in self._layer.parameters(): + dist.broadcast( + p, + src=self._global_root_rank, + group=self._group, + use_calc_stream=True) + + # Multi stream operation will be supported later + dist.wait(tensor=p, group=self._group, use_calc_stream=True) + + def _clear_gradients(self): + assert len(self._trainable_params.keys()) > 0 + current_layer_params = self._layer.parameters(include_sublayers=True) + trainable_params = list( + filter(lambda x: x.trainable, current_layer_params)) + for param in trainable_params: + assert hasattr( + param, "fw_storage" + ), "Find {} don't have fw_storage attribute.".format(param.name) + + # param.bw_storage.zero_() + param.fw_storage.clear_gradient(False) + param.fw_storage._gradient_set_empty(False) + param.bw_storage._clear() + + # Update param memery slice + def _update_params_slice(self): + update_list = self._update_params() + + if not isinstance(self._optim._param_groups[0], dict): + slice_params = [param.fw_storage for param in update_list] + self._optim._parameter_list = 
slice_params + self._optim._param_groups = slice_params + else: + params_name_list = list(map(lambda p: p.name, update_list)) + for param_group in self._optim._param_groups: + slice_p = [] + for p in param_group['params']: + if p.name in params_name_list: + assert hasattr( + p, "fw_storage" + ), "Find {} don't have fw_storage attribute.".format( + p.name) + slice_p.append(p.fw_storage) + param_group['params'] = slice_p + + def forward(self, *inputs, **kwargs): + """ + A wrapper for Sharding Stage3 layer. + """ + # 1.Sync layer's buffers state + if self.__sync_buffers: + self._sync_buffers() + + # 2.Normal FW on the base model + fw = self._layer(*inputs, **kwargs) + + return fw + + def _segment_rank_params(self, layer, name="last_layer"): + current_layer_params = _current_layer_params(layer) + if current_layer_params: + CHECK_LAYER[id(layer)] = name + self._flatten_layer_params(layer, current_layer_params) + + for name, sub_layer in layer.named_children(): + self._segment_rank_params(sub_layer, name) + + def _flatten_layer_params(self, layer, current_layer_params): + def _add_manage_info(trainable_param): + return _PartitionParam(trainable_param) + + trainable_params = list( + filter(lambda x: x.trainable, current_layer_params)) + assert id(layer) not in self._trainable_params.keys() + self._trainable_params[id(layer)] = list( + map(_add_manage_info, trainable_params)) + + for param in self._trainable_params[id(layer)]: + if param.name in self._param2buffer.keys(): + continue + self._param2buffer[param.name] = [] + # 1.Params alignment + offset = 0 + # CUDA alignment 256 bytes + size = param._numel() * align[param.dtype] + remaining = size % alignment[self._default_device] + ali = 0 if remaining == 0 else alignment[ + self._default_device] - remaining + align_ = ali // align[param.dtype] + + offset = align_ + param._numel() + buffer_size = offset if offset % self._group.nranks == 0 else offset + self._group.nranks - ( + offset % self._group.nranks) + self._param2buffer_size[param.name] = buffer_size + + # 2.Combination param buffer + assert buffer_size % self._group.nranks == 0 + pre_buffer = buffer_size // self._group.nranks + + for rank_ in range(self._group.nranks): + self._param2buffer[param.name].append( + (rank_ * pre_buffer, (rank_ + 1) * pre_buffer)) + + # 3.Flatten layer params and release other rank buffer + self._param_storage(param, buffer_size) + + def _param_storage(self, param, buffer_size): + assert isinstance(buffer_size, int) + value = np.zeros( + buffer_size, + dtype=np.float16) if Type.fp16.value == param.dtype else np.zeros( + buffer_size, dtype=np.float32) + buffer = core.VarBase(value=value, place=core.CPUPlace()) + + param_shape = param.shape + origin_state = param.stop_gradient + param.stop_gradient = True + param.flatten_() + param.stop_gradient = origin_state + start, end = self._param2buffer[param.name][self._rank] + + # Copy the current param value + tmp_var = core.VarBase( + tensor=buffer._slice(0, param._numel()), place=core.CPUPlace()) + param_cpu = param.cpu() + tmp_var.value().get_tensor().set(param_cpu.value().get_tensor(), + core.CPUPlace()) + param.value().get_tensor()._set_dims(param_shape) + param._clear() + + # Current rank param_storage + param.fw_storage = core.VarBase( + buffer._slice(start, end), "slice@" + param.name) + param.status = "part" + + # Updata optimizer master weights + if param.dtype == Type.fp16.value: + self._optim._master_weights[param.fw_storage.name] = paddle.cast( + param.fw_storage, Type.fp32.value) + + def 
_register_forward_hooks(self, layer): + current_layer_params = _current_layer_params(layer) + if current_layer_params: + self._register_forward_all_hooks(layer, self._task_flow) + + for _, sub_layer in layer.named_children(): + self._register_forward_hooks(sub_layer) + + def _register_forward_all_hooks(self, sub_layer, task_flow): + def _forward_pre_hook(layer, inputs): + return ForwardPreHooks(layer, self._order_tracer, + self._trainable_params, self._param2buffer, + self._rank, self._group, self._sync_comm, + task_flow) + + def _forward_post_hook(layer, inputs, outputs): + return ForwardPostHooks.apply( + outputs, layer, self._order_tracer, self._trainable_params, + self._param2buffer, self._param2buffer_size, self._rank, + self._group, self._sync_comm, task_flow) + + # register previous forward hooks + sub_layer.register_forward_pre_hook(_forward_pre_hook) + + # register post forward hooks + sub_layer.register_forward_post_hook(_forward_post_hook) + + @paddle.no_grad() + def _sync_buffers(self): + for buffer in self._layer.buffers(include_sublayers=True): + dist.broadcast( + buffer, + self._global_root_rank, + self._group, + use_calc_stream=True) + # Multi stream operation will be supported later + dist.wait(tensor=buffer, group=self._group, use_calc_stream=True) + + def __getattr__(self, name): + """Forward missing attributes to wrapped layer.""" + try: + return super().__getattr__(name) + except AttributeError: + return getattr(self._layer, name) + + def _update_params(self): + update_list = [] + assert len(self._trainable_params.keys()) > 0 + current_layer_params = self._layer.parameters(include_sublayers=True) + trainable_params = list( + filter(lambda x: x.trainable, current_layer_params)) + for param in trainable_params: + assert hasattr( + param, + "fw_storage"), "Find {} don't have fw_storage attribute".format( + param.name) + + if self._accumulate_grads: + param.bw_storage.scale_(scale=self._world_size_scaling) + param.fw_storage = _VarBaseWrapper(param) + param.fw_storage._copy_gradient_from(param.bw_storage) + update_list.append(param) + return update_list + + def get_all_parameters(self): + assert len(self._trainable_params.keys()) > 0 + current_layer_params = self._layer.parameters(include_sublayers=True) + trainable_params = list( + filter(lambda x: x.trainable, current_layer_params)) + for param in trainable_params: + if param.use_count > 0: + continue + assert hasattr( + param, + "fw_storage"), "Find {} don't have fw_storage attribute".format( + param.name) + + full_param = _all_gather( + param.fw_storage, self._group, use_calc_stream=True) + dist.wait( + tensor=full_param, group=self._group, use_calc_stream=True) + core.VarBase(full_param._slice(0, param._numel()))._share_buffer_to( + param) + param.value().get_tensor()._set_dims(param.shape) + param.fw_storage._clear() + param.fw_storage = None + param.status = "all" + param.use_count += 1 + + self._optim._parameter_list = self._ori_parameter_list + self._optim._param_groups = self._ori_param_groups + + def _register_backward_hooks(self): + current_layer_params = self._layer.parameters(include_sublayers=True) + trainable_params = list( + filter(lambda x: x.trainable, current_layer_params)) + + for param in trainable_params: + allreduce_function = self._get_allreduce_fn(param) + param._register_backward_hook(allreduce_function) + + def _get_allreduce_fn(self, param): + @paddle.no_grad() + def reduce(*_): + if param.name in self._task_flow.full_grad.keys(): + full_grad = self._task_flow.full_grad[param.name] + with 
paddle.amp.auto_cast(enable=False): + if not self._accumulate_grads: + full_grad.scale_(scale=self._world_size_scaling) + # Only support sync allreduce current rank's layer now + dist.all_reduce( + tensor=full_grad, + group=self._group, + use_calc_stream=True) + dist.wait( + tensor=full_grad, + group=self._group, + use_calc_stream=True) + + start, end = self._param2buffer[param.name][self._rank] + if not self._accumulate_grads or param.bw_storage is None: + param.bw_storage = core.VarBase( + full_grad._slice(start, end)).detach().clone() + else: + param.bw_storage.add_( + core.VarBase(full_grad._slice(start, end)).detach() + .clone()) + param.clear_gradient(False) + param._gradient_set_empty(False) + tmp_var = self._task_flow.full_grad.pop(param.name) + tmp_var._clear() + + if param.name in self._task_flow.full_param.keys(): + if param.status == "all": + param.use_count = 0 + param._clear() + start, end = self._param2buffer[param.name][self._rank] + with paddle.amp.auto_cast(enable=False): + param.fw_storage = core.VarBase( + self._task_flow.full_param[param.name]._slice(start, + end), + param.name + "@slice").detach().clone() + param.status = "part" + tmp_var = self._task_flow.full_param.pop(param.name) + tmp_var._clear() + + return reduce + + def _redefine_opt_step(self): + params_slice_func = self._update_params_slice + opt_step = self._optim.step + update_scaler = self._optim.update_scaler + + def _opt_step(self): + if not update_scaler: + params_slice_func() + opt_step() + + self._optim.step = MethodType(_opt_step, self._optim) + + def _redefine_opt_clear(self): + clear_func = self._clear_gradients + + def _opt_clear(self): + clear_func() + + self._optim.clear_grad = MethodType(_opt_clear, self._optim) + + +def ForwardPreHooks(layer, order_tracer, trainable_params, param2buffer, rank, + group, sync_comm, task_flow): + + # Record layer's id + layer_id = id(layer) + use_calc, sync_wait = False, False + + if layer_id not in order_tracer.keys() or sync_comm: + use_calc, sync_wait = True, True + task_flow.use_calc[layer_id] = use_calc + else: + task_flow.use_calc[layer_id] = use_calc + _wait_layer(trainable_params, layer_id, task_flow, group, use_calc) + + if layer_id == order_tracer["layer"][-1]: return + order_ = order_tracer[layer_id] + layer_id = order_tracer["layer"][order_ + 1] + _allgather_buffer( + layer_id, + trainable_params, + group, + use_calc_stream=use_calc, + task_flow=task_flow, + sync_wait=sync_wait) + return + + +class ForwardPostHooks(PyLayer): + @staticmethod + def forward(ctx, inputs, layer, order_tracer, trainable_params, + param2buffer, param2buffer_size, rank, group, sync_comm, + task_flow): + _release_param(layer, trainable_params, param2buffer, rank, task_flow) + + layer_id = id(layer) + if layer_id not in order_tracer.keys(): + order_ = order_tracer["order"] + order_tracer[layer_id] = order_ + order_tracer["order"] += 1 + order_tracer["layer"].append(layer_id) + ctx.order_tracer = order_tracer + ctx.task_flow = task_flow + ctx.group = group + ctx.layer = layer + ctx.sync_comm = sync_comm + ctx.trainable_params = trainable_params + ctx.param2buffer_size = param2buffer_size + + return inputs + + @staticmethod + def backward(ctx, *args): + # Load context value + order_tracer = ctx.order_tracer + task_flow = ctx.task_flow + group = ctx.group + layer = ctx.layer + trainable_params = ctx.trainable_params + param2buffer_size = ctx.param2buffer_size + sync_comm = ctx.sync_comm + layer_id = id(layer) + use_calc, sync_wait = False, False + if sync_comm: + use_calc, 
sync_wait = True, True + _allgather_buffer( + layer_id, + trainable_params, + group, + use_calc_stream=use_calc, + task_flow=task_flow, + sync_wait=sync_wait) + else: + _wait_layer(trainable_params, layer_id, task_flow, group, use_calc) + _create_params_grad(layer, trainable_params, param2buffer_size, + task_flow) + task_flow.use_calc[layer_id] = use_calc + if layer_id != order_tracer["layer"][0] and not sync_comm: + layer_next_id = order_tracer["layer"][order_tracer[layer_id] - 1] + _allgather_buffer( + layer_next_id, + trainable_params, + group, + use_calc_stream=use_calc, + task_flow=task_flow, + sync_wait=sync_wait) + + return args + + +class TaskFlow: + """ + Task flows, one way linked list for task acquisition. + """ + + def __init__(self, + full_param=dict(), + full_grad=dict(), + use_calc=dict(), + callback=None): + self.full_param = full_param + self.full_grad = full_grad + self.use_calc = use_calc + self.callback = callback + + +def _release_param(layer, trainable_params, param2buffer, rank, task_flow): + for param in trainable_params[id(layer)]: + # async communicate share weight not clear + param.use_count -= 1 + if param.use_count == 0: + param._clear() + if param.name in task_flow.full_param.keys(): + start, end = param2buffer[param.name][rank] + with paddle.amp.auto_cast(enable=False): + param.fw_storage = core.VarBase( + task_flow.full_param[param.name]._slice(start, end), + param.name + "@slice").detach().clone() + param.status = "part" + tmp_var = task_flow.full_param.pop(param.name) + tmp_var._clear() + return + + +def _wait_layer(trainable_params, layer_id, task_flow, group, use_calc_stream): + for param in trainable_params[layer_id]: + if param.status == "all": + param.use_count += 1 + continue + if param.name in task_flow.full_param.keys(): + full_param = task_flow.full_param[param.name] + with paddle.amp.auto_cast(enable=False): + paddle.device.cuda.synchronize() + core.VarBase(full_param._slice(0, param._numel()))._share_buffer_to( + param) + param.value().get_tensor()._set_dims(param.shape) + param.fw_storage._clear() + param.fw_storage = None + param.status = "all" + param.use_count += 1 + else: + _allgather_buffer( + layer_id, + trainable_params, + group, + use_calc_stream, + task_flow, + sync_wait=True) + break + return task_flow + + +def _allgather_buffer(layer_id, + trainable_params, + group, + use_calc_stream, + task_flow, + sync_wait=False): + for param in trainable_params[layer_id]: + if param.status == "all": + param.use_count += 1 + continue + with paddle.amp.auto_cast(enable=False): + full_param = _all_gather( + param.fw_storage, group, use_calc_stream=use_calc_stream) + if sync_wait: + with paddle.amp.auto_cast(enable=False): + dist.wait( + tensor=full_param, + group=group, + use_calc_stream=use_calc_stream) + core.VarBase(full_param._slice(0, param._numel()))._share_buffer_to( + param) + param.value().get_tensor()._set_dims(param.shape) + param.fw_storage._clear() + param.fw_storage = None + param.status = "all" + param.use_count += 1 + task_flow.full_param[param.name] = full_param + return task_flow + + +@paddle.no_grad() +def _create_params_grad(layer, trainable_params, param2buffer_size, task_flow): + for param in trainable_params[id(layer)]: + if param.name in task_flow.full_grad.keys(): + continue + assert isinstance(param2buffer_size[param.name], int) + temp_grad = paddle.zeros( + [param2buffer_size[param.name]], dtype=param.dtype) + param._copy_gradient_from( + core.VarBase(temp_grad._slice(0, param._numel()))) + task_flow.full_grad[param.name] 
= temp_grad + return task_flow + + +def _PartitionParam(param): + if not hasattr(param, "fw_storage"): + setattr(param, "fw_storage", None) + setattr(param, "bw_storage", None) + setattr(param, "status", "all") + setattr(param, "use_count", 0) + return param + + +def _VarBaseWrapper(param): + varbase = param.fw_storage + tmp_param = ParamBase( + shape=varbase.shape, dtype=varbase.dtype, name="slice@" + param.name) + varbase._share_buffer_to(tmp_param) + tmp_param.regularizer = param.regularizer + tmp_param.optimize_attr['learning_rate'] = param.optimize_attr[ + 'learning_rate'] + varbase._clear() + return tmp_param + + +def _OptimizerWrapper(optimizer, offload, group, update_params_slice): + if not hasattr(optimizer, "_optim"): + setattr(optimizer, "_optim", optimizer) + setattr(optimizer, "offload", offload) + setattr(optimizer, "group", group) + setattr(optimizer, "update_scaler", None) + setattr(optimizer, "update_slice", update_params_slice) + return optimizer + + +def _current_layer_params(layer): + return layer.parameters( + include_sublayers=False) + list(layer.extra_parameters) if hasattr( + layer, "extra_parameters") else layer.parameters( + include_sublayers=False) diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py index 272aada576be8..5f696195c1abc 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py @@ -152,6 +152,9 @@ def unscale_method(self, optimizer): param_grads = [] param_grads_fp16 = [] param_grads_fp32 = [] + if hasattr(optimizer, "update_slice"): + optimizer.update_slice() + optimizer.update_scaler = True if getattr(optimizer._optim, '_param_groups', None) and isinstance( optimizer._optim._param_groups[0], dict): @@ -161,27 +164,21 @@ def unscale_method(self, optimizer): if param._grad_ivar() is not None: param_grads.append(param._grad_ivar()) if param._grad_ivar( - ).dtype == core.VarDesc.VarType.FP16: + ).dtype in [core.VarDesc.VarType.FP16, paddle.float16]: param_grads_fp16.append(param._grad_ivar()) else: param_grads_fp32.append(param._grad_ivar()) else: - param_grads = [ - param._grad_ivar() for param in optimizer._optim._parameter_list - if param._grad_ivar() is not None - ] - param_grads_fp16 = [ - param._grad_ivar() for param in optimizer._optim._parameter_list - if (param._grad_ivar() is not None - ) and (param._grad_ivar().dtype == core.VarDesc.VarType.FP16 - ) - ] - param_grads_fp32 = [ - param._grad_ivar() for param in optimizer._optim._parameter_list - if (param._grad_ivar() is not None - ) and (param._grad_ivar().dtype == core.VarDesc.VarType.FP32 - ) - ] + for param in optimizer._optim._parameter_list: + if param.grad is not None: + param_grads.append(param.grad) + if param.grad.dtype in [ + core.VarDesc.VarType.FP16, paddle.float16 + ]: + param_grads_fp16.append(param.grad) + else: + param_grads_fp32.append(param.grad) + temp_found_inf_fp16 = to_variable(np.array([0]).astype(np.bool)) temp_found_inf_fp32 = to_variable(np.array([0]).astype(np.bool)) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 67697fcfd8398..c0c13866ccd55 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -34,6 +34,7 @@ list(APPEND DIST_TEST_OPS test_parallel_dygraph_tensor_parallel) list(APPEND DIST_TEST_OPS 
test_parallel_dygraph_sharding_parallel) list(APPEND DIST_TEST_OPS test_dygraph_sharding_optimizer_stage2) list(APPEND DIST_TEST_OPS test_dygraph_sharding_stage2) +list(APPEND DIST_TEST_OPS test_dygraph_sharding_stage3) list(APPEND DIST_TEST_OPS test_auto_parallel_parallelizer) list(APPEND DIST_TEST_OPS test_parallel_dygraph_mp_layers) list(APPEND DIST_TEST_OPS test_hybrid_parallel_inference_helper) @@ -250,6 +251,7 @@ if ((NOT WITH_GPU) AND (NOT WITH_ROCM)) list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_sharding_parallel) list(REMOVE_ITEM TEST_OPS test_dygraph_sharding_optimizer_stage2) list(REMOVE_ITEM TEST_OPS test_dygraph_sharding_stage2) + list(REMOVE_ITEM TEST_OPS test_dygraph_sharding_stage3) list(REMOVE_ITEM TEST_OPS test_auto_parallel_parallelizer) list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_mp_layers) LIST(REMOVE_ITEM TEST_OPS test_imperative_auto_mixed_precision) @@ -1058,6 +1060,7 @@ if(WITH_DISTRIBUTE AND WITH_GPU AND WITH_NCCL) set_tests_properties(test_parallel_dygraph_sharding_parallel PROPERTIES TIMEOUT 120) set_tests_properties(test_dygraph_sharding_optimizer_stage2 PROPERTIES TIMEOUT 120) set_tests_properties(test_dygraph_sharding_stage2 PROPERTIES TIMEOUT 120) + set_tests_properties(test_dygraph_sharding_stage3 PROPERTIES TIMEOUT 120) set_tests_properties(test_auto_parallel_parallelizer PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_dygraph_mp_layers PROPERTIES TIMEOUT 120) set_tests_properties(test_hybrid_parallel_inference_helper PROPERTIES TIMEOUT 120) diff --git a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3.py b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3.py new file mode 100644 index 0000000000000..5b0bec9c454b0 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3.py @@ -0,0 +1,233 @@ +# -*- coding: UTF-8 -*- + +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
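+# Compares parameters trained with ShardingStage2 and ShardingStage3 (fp32, fp16, gradient accumulation),
+# and ShardingStage3 with and without sync_comm (recompute), expecting matching results.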
+ +import numpy as np +import argparse +import ast +import time +import paddle +import paddle.fluid as fluid +from paddle.fluid.dygraph.nn import Linear +from paddle.distributed import fleet +from paddle.fluid.dygraph import nn + +from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.sharding_optimizer_stage2 import ShardingOptimizerStage2 +from paddle.distributed.fleet.meta_parallel.sharding.sharding_stage2 import ShardingStage2 +from paddle.distributed.fleet.meta_parallel.sharding.sharding_stage3 import ShardingStage3 +from paddle.distributed.fleet.meta_parallel.sharding.sharding_utils import ShardingScaler + +epoch = 10 +batch_size = 32 +paddle.seed(2021) +np.random.seed(2021) +base_lr = 0.1 +momentum_rate = 0.9 +l2_decay = 1e-4 +fleet.init(is_collective=True) + + +class MLP(fluid.Layer): + def __init__(self, linear_size=1000, param_attr=None, bias_attr=None): + super(MLP, self).__init__() + + self._linear1 = Linear(linear_size, linear_size) + self._linear2 = Linear(linear_size, linear_size) + self._linear3 = Linear(linear_size, 10) + + def forward(self, inputs): + y = self._linear1(inputs) + y = self._linear2(y) + y = self._linear3(y) + return y + + +def reader_decorator(linear_size=1000): + def __reader__(): + for _ in range(100): + img = np.random.rand(linear_size).astype('float32') + label = np.ones(1).astype('int64') + yield img, label + + return __reader__ + + +def optimizer_setting(model, use_pure_fp16, opt_group=False): + clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0) + optimizer = paddle.optimizer.AdamW( + parameters=[{ + "params": model.parameters() + }] if opt_group else model.parameters(), + learning_rate=0.001, + weight_decay=0.00001, + grad_clip=clip, + multi_precision=use_pure_fp16) + + return optimizer + + +def train_mlp(model, + sharding_stage, + use_pure_fp16=False, + accumulate_grad=False, + opt_group=False, + recompute=False): + group = paddle.distributed.new_group([0, 1]) + if opt_group: + optimizer = optimizer_setting( + model=model, use_pure_fp16=use_pure_fp16, opt_group=opt_group) + else: + optimizer = optimizer_setting(model=model, use_pure_fp16=use_pure_fp16) + + if use_pure_fp16: + model = paddle.amp.decorate( + models=model, level='O2', save_dtype='float32') + scaler = paddle.amp.GradScaler(init_loss_scaling=32768) + scaler = ShardingScaler(scaler) + if sharding_stage == 2: + optimizer = ShardingOptimizerStage2( + params=model.parameters(), optim=optimizer, group=group) + model = ShardingStage2( + model, + optimizer, + group=group, + buffer_max_size=2**21, + accumulate_grads=accumulate_grad) + elif sharding_stage == 3: + model = ShardingStage3( + model, optimizer=optimizer, group=group, sync_comm=recompute) + + train_reader = paddle.batch( + reader_decorator(), batch_size=batch_size, drop_last=True) + + train_loader = paddle.io.DataLoader.from_generator( + capacity=32, + use_double_buffer=True, + iterable=True, + return_list=True, + use_multiprocess=True) + train_loader.set_sample_list_generator(train_reader) + + for eop in range(epoch): + model.train() + for batch_id, data in enumerate(train_loader()): + img, label = data + label.stop_gradient = True + img.stop_gradient = True + with paddle.amp.auto_cast(True, level='O2'): + out = model(img) + loss = paddle.nn.functional.cross_entropy( + input=out, label=label) + avg_loss = paddle.mean(x=loss.cast(dtype=paddle.float32)) + if not accumulate_grad: + if not use_pure_fp16: + avg_loss.backward() + optimizer.step() + else: + scaler.scale(avg_loss).backward() + scaler.step(optimizer) + 
scaler.update() + optimizer.clear_grad() + if accumulate_grad: + if not use_pure_fp16: + avg_loss.backward() + optimizer.step() + else: + scaler.scale(avg_loss).backward() + scaler.step(optimizer) + scaler.update() + optimizer.clear_grad() + if sharding_stage == 3: + model.get_all_parameters() + return model.parameters() + + +def test_stage2_stage3(): + mlp, mlp1, mlp2, mlp3, mlp4, mlp5, mlp6, mlp7, mlp8 = MLP(), MLP(), MLP( + ), MLP(), MLP(), MLP(), MLP(), MLP(), MLP() + state_dict = mlp.state_dict() + mlp1.set_state_dict(state_dict) + mlp2.set_state_dict(state_dict) + mlp3.set_state_dict(state_dict) + mlp4.set_state_dict(state_dict) + mlp5.set_state_dict(state_dict) + mlp6.set_state_dict(state_dict) + mlp7.set_state_dict(state_dict) + mlp8.set_state_dict(state_dict) + # fp32 + stage2_params = train_mlp( + mlp1, sharding_stage=2, use_pure_fp16=False, opt_group=True) + stage3_params = train_mlp( + mlp2, sharding_stage=3, use_pure_fp16=False, opt_group=True) + for i in range(len(stage2_params)): + for j in range(len(stage3_params)): + if stage2_params[i].name == stage3_params[j].name: + np.testing.assert_allclose( + stage2_params[i].numpy(), + stage3_params[j].numpy(), + rtol=1e-6) + # fp32 accumulate grad + stage2_params = train_mlp( + mlp3, + sharding_stage=2, + use_pure_fp16=False, + accumulate_grad=True, + opt_group=True) + stage3_params = train_mlp( + mlp4, + sharding_stage=3, + use_pure_fp16=False, + accumulate_grad=True, + opt_group=True) + for i in range(len(stage2_params)): + for j in range(len(stage3_params)): + if stage2_params[i].name == stage3_params[j].name: + np.testing.assert_allclose( + stage2_params[i].numpy(), + stage3_params[j].numpy(), + rtol=1e-6) + # fp16 + stage2_params = train_mlp( + mlp5, sharding_stage=2, use_pure_fp16=True, opt_group=False) + stage3_params = train_mlp( + mlp6, sharding_stage=3, use_pure_fp16=True, opt_group=False) + for i in range(len(stage2_params)): + for j in range(len(stage3_params)): + if stage2_params[i].name == stage3_params[j].name: + np.testing.assert_allclose( + stage2_params[i].numpy(), + stage3_params[j].numpy(), + rtol=1e-6) + # fp16 recompute + stage3_params = train_mlp( + mlp7, sharding_stage=3, use_pure_fp16=True, opt_group=False) + stage3_params_re = train_mlp( + mlp8, + sharding_stage=3, + use_pure_fp16=True, + opt_group=False, + recompute=True) + for i in range(len(stage3_params)): + for j in range(len(stage3_params_re)): + if stage3_params[i].name == stage3_params_re[j].name: + np.testing.assert_allclose( + stage3_params[i].numpy(), + stage3_params_re[j].numpy(), + rtol=1e-6) + return + + +if __name__ == '__main__': + test_stage2_stage3() diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage3.py b/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage3.py new file mode 100644 index 0000000000000..89d5f2e8c7b29 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage3.py @@ -0,0 +1,31 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import paddle.fluid as fluid + +from test_parallel_dygraph_dataparallel import TestMultipleGpus + + +class TestDygraphShardingStage3(TestMultipleGpus): + + # check sharding logic as well as the accuracy with single mode + def test_dygraph_sharding_optimizer_stage3(self): + self.run_mnist_2gpu('dygraph_sharding_stage3.py') + + +if __name__ == "__main__": + unittest.main() From 0de8a805a89eb70203163a34858ff504afff30df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=98=8E=E5=86=AC?= <78149749+winter-wang@users.noreply.github.com> Date: Fri, 14 Jan 2022 16:05:00 +0800 Subject: [PATCH 21/24] [infrt] update the version of llvm. test=develop (#38843) --- cmake/external/llvm.cmake | 13 +- paddle/infrt/CMakeLists.txt | 1 - paddle/infrt/common/global.h | 2 +- paddle/infrt/dialect/CMakeLists.txt | 6 +- paddle/infrt/dialect/basic_kernels.cc | 22 +-- paddle/infrt/dialect/basic_kernels.h | 5 +- paddle/infrt/dialect/basic_kernels.td | 7 +- paddle/infrt/dialect/dense_tensor.cc | 148 +++++------------- paddle/infrt/dialect/dense_tensor.h | 51 ++++-- paddle/infrt/dialect/diagnostic_utils.cc | 7 +- paddle/infrt/dialect/diagnostic_utils.h | 6 +- paddle/infrt/dialect/dialect.cc | 16 +- paddle/infrt/dialect/infrt_base.cc | 6 +- paddle/infrt/dialect/infrt_base.h | 32 ++-- paddle/infrt/dialect/infrt_base.td | 6 +- paddle/infrt/dialect/init_infrt_dialects.cc | 12 +- paddle/infrt/dialect/init_infrt_dialects.h | 8 +- paddle/infrt/dialect/mlir_loader.cc | 18 ++- paddle/infrt/dialect/mlir_loader.h | 9 +- paddle/infrt/dialect/mlir_loader_test.cc | 11 +- paddle/infrt/dialect/mlir_tests/rewrite.mlir | 2 +- .../dialect/mlir_tests/rewrite_conv_bn.mlir | 2 +- paddle/infrt/dialect/mlir_tests/trt_ops.mlir | 2 +- paddle/infrt/dialect/ops.td | 6 - paddle/infrt/dialect/opt.cc | 26 +-- paddle/infrt/dialect/pd_op_base.td | 2 +- paddle/infrt/dialect/pd_ops.cc | 29 ++-- paddle/infrt/dialect/pd_ops.h | 36 ++--- paddle/infrt/dialect/pd_ops.td | 14 +- paddle/infrt/dialect/pd_types.h | 11 +- paddle/infrt/dialect/print_ir.cc | 45 +++--- paddle/infrt/dialect/tensor_shape.cc | 16 +- paddle/infrt/dialect/tensor_shape.h | 8 +- paddle/infrt/dialect/tensor_shape_base.td | 4 +- paddle/infrt/dialect/tensorrt/trt_exec.cc | 4 +- .../dialect/tensorrt/trt_graph_fuse_pass.cc | 78 +++++---- .../dialect/tensorrt/trt_graph_fuse_pass.h | 12 +- .../dialect/tensorrt/trt_graph_split_pass.cc | 20 +-- .../dialect/tensorrt/trt_graph_split_pass.h | 10 +- .../dialect/tensorrt/trt_op_teller_pass.cc | 25 ++- .../dialect/tensorrt/trt_op_teller_pass.h | 14 +- paddle/infrt/dialect/tensorrt/trt_ops.cc | 22 ++- paddle/infrt/dialect/tensorrt/trt_ops.h | 41 +++-- paddle/infrt/dialect/test_kernels.cc | 75 ++++----- paddle/infrt/dialect/test_kernels.h | 7 +- paddle/infrt/dialect/types.cc | 17 -- paddle/infrt/dialect/types.h | 16 -- paddle/infrt/host_context/core_runtime.cc | 6 +- paddle/infrt/host_context/core_runtime.h | 6 +- paddle/infrt/host_context/kernel_frame.h | 6 +- .../host_context/kernel_registry_test.cc | 6 +- .../infrt/host_context/kernel_utils_test.cc | 6 +- .../host_context/mlir_function_executable.cc | 1 + .../host_context/mlir_function_executable.h | 3 +- .../host_context/mlir_program_executor.h | 4 +- .../host_context/mlir_to_runtime_translate.cc | 90 ++++++----- .../host_context/mlir_to_runtime_translate.h | 8 +- .../mlir_to_runtime_translate_test.cc | 12 +- 
paddle/infrt/host_context/op_executable.cc | 7 +- paddle/infrt/host_context/op_executable.h | 12 +- paddle/infrt/kernel/basic_kernels.cc | 6 +- paddle/infrt/kernel/basic_kernels.h | 12 +- paddle/infrt/kernel/tensor_kernels.cc | 6 +- paddle/infrt/kernel/tensor_kernels.h | 12 +- paddle/infrt/kernel/tensor_shape_kernels.cc | 6 +- paddle/infrt/kernel/tensor_shape_kernels.h | 12 +- paddle/infrt/kernel/test_kernels.cc | 6 +- paddle/infrt/kernel/test_kernels.h | 12 +- paddle/infrt/paddle/cpp/desc_api.h | 8 +- paddle/infrt/paddle/model_parser.cc | 6 +- paddle/infrt/paddle/model_parser.h | 6 +- paddle/infrt/paddle/pb/block_desc.cc | 8 +- paddle/infrt/paddle/pb/block_desc.h | 8 +- paddle/infrt/paddle/pb/op_desc.cc | 8 +- paddle/infrt/paddle/pb/op_desc.h | 8 +- paddle/infrt/paddle/pb/program_desc.cc | 8 +- paddle/infrt/paddle/pb/program_desc.h | 8 +- paddle/infrt/paddle/pb/var_desc.cc | 8 +- paddle/infrt/paddle/pb/var_desc.h | 8 +- 79 files changed, 616 insertions(+), 637 deletions(-) delete mode 100644 paddle/infrt/dialect/ops.td delete mode 100644 paddle/infrt/dialect/types.cc delete mode 100644 paddle/infrt/dialect/types.h diff --git a/cmake/external/llvm.cmake b/cmake/external/llvm.cmake index e080a7359af98..27210e5260048 100644 --- a/cmake/external/llvm.cmake +++ b/cmake/external/llvm.cmake @@ -1,7 +1,7 @@ include(FetchContent) -set(LLVM_DOWNLOAD_URL https://paddle-inference-dist.bj.bcebos.com/CINN/llvm11.tar.gz) -set(LLVM_MD5 39d32b6be466781dddf5869318dcba53) +set(LLVM_DOWNLOAD_URL https://paddle-inference-dist.bj.bcebos.com/infrt/llvm_b5149f4e66a49a98b67e8e2de4e24a4af8e2781b.tar.gz) +set(LLVM_MD5 022819bb5760817013cf4b8a37e97d5e) set(FETCHCONTENT_BASE_DIR ${THIRD_PARTY_PATH}/llvm) set(FETCHCONTENT_QUIET OFF) @@ -51,7 +51,7 @@ message(STATUS "Using LLVMConfig.cmake in: ${LLVM_DIR}") # To build with MLIR, the LLVM is build from source code using the following flags: #[==[ -cmake -G Ninja ../llvm \ +cmake ../llvm -G "Unix Makefiles" \ -DLLVM_ENABLE_PROJECTS="mlir;clang" \ -DLLVM_BUILD_EXAMPLES=OFF \ -DLLVM_TARGETS_TO_BUILD="X86" \ @@ -59,8 +59,10 @@ cmake -G Ninja ../llvm \ -DLLVM_ENABLE_ASSERTIONS=ON \ -DLLVM_ENABLE_ZLIB=OFF \ -DLLVM_ENABLE_RTTI=ON \ + -DLLVM_INSTALL_UTILS=ON \ + -DCMAKE_INSTALL_PREFIX=./install #]==] -# The matched llvm-project version is f9dc2b7079350d0fed3bb3775f496b90483c9e42 (currently a temporary commit) +# The matched llvm-project version is b5149f4e66a49a98b67e8e2de4e24a4af8e2781b (currently a temporary commit) add_definitions(${LLVM_DEFINITIONS}) @@ -75,7 +77,7 @@ add_definitions(${LLVM_DEFINITIONS}) # The minimum needed libraries for MLIR IR parse and transform. 
-set(MLIR_IR_LIBS MLIRAnalysis MLIRStandardOps MLIRPass MLIRParser MLIRDialect MLIRIR MLIROptLib) +set(MLIR_IR_LIBS MLIRAnalysis MLIRPass MLIRParser MLIRDialect MLIRIR MLIROptLib) # tb_base is the name of a xxx.td file (without the .td suffix) @@ -89,6 +91,7 @@ function(mlir_tablegen_on td_base) mlir_tablegen(${td_base}.cpp.inc -gen-op-defs) if (mlir_tablegen_on_DIALECT) mlir_tablegen(${td_base}_dialect.hpp.inc --gen-dialect-decls -dialect=${mlir_tablegen_on_DIALECT}) + mlir_tablegen(${td_base}_dialect.cpp.inc --gen-dialect-defs -dialect=${mlir_tablegen_on_DIALECT}) endif() add_public_tablegen_target(${td_base}_IncGen) add_custom_target(${td_base}_inc DEPENDS ${td_base}_IncGen) diff --git a/paddle/infrt/CMakeLists.txt b/paddle/infrt/CMakeLists.txt index 8f05d286bf033..8af3012a220ad 100644 --- a/paddle/infrt/CMakeLists.txt +++ b/paddle/infrt/CMakeLists.txt @@ -77,7 +77,6 @@ add_subdirectory(paddle) # MLIR td file generations set(infrt_mlir_incs - ops_inc basic_kernels_inc test_kernels_inc infrt_base_inc diff --git a/paddle/infrt/common/global.h b/paddle/infrt/common/global.h index f89164d03f31d..e6586cb3a3c60 100644 --- a/paddle/infrt/common/global.h +++ b/paddle/infrt/common/global.h @@ -14,7 +14,7 @@ #pragma once -#include "mlir/IR/MLIRContext.h" +#include #include "paddle/infrt/tensor/dense_host_tensor.h" namespace infrt { diff --git a/paddle/infrt/dialect/CMakeLists.txt b/paddle/infrt/dialect/CMakeLists.txt index d145843684c63..c064b2145266b 100644 --- a/paddle/infrt/dialect/CMakeLists.txt +++ b/paddle/infrt/dialect/CMakeLists.txt @@ -2,7 +2,6 @@ core_gather_headers() gather_srcs(infrt_src SRCS dialect.cc - types.cc basic_kernels.cc test_kernels.cc infrt_base.cc @@ -14,8 +13,6 @@ gather_srcs(infrt_src SRCS pd_types.cc pd_ops.cc ) - -mlir_tablegen_on(ops) mlir_tablegen_on(basic_kernels) mlir_tablegen_on(test_kernels) mlir_tablegen_on(infrt_base DIALECT infrt) @@ -27,8 +24,7 @@ mlir_add_rewriter(rewrite) # TODO(Superjomn) add a cmake function cc_executable to ecapsulate the following code add_executable(infrtopt opt.cc) -target_link_libraries(infrtopt infrt ${mlir_libs}) -add_dependencies(infrtopt infrt) +target_link_libraries(infrtopt infrt) add_executable(print-ir print_ir.cc) target_link_libraries(print-ir infrt ${mlir_libs}) diff --git a/paddle/infrt/dialect/basic_kernels.cc b/paddle/infrt/dialect/basic_kernels.cc index b4d2b9182b0c5..bad7e73ec5ae5 100644 --- a/paddle/infrt/dialect/basic_kernels.cc +++ b/paddle/infrt/dialect/basic_kernels.cc @@ -17,17 +17,17 @@ #include #include #include -#include -#include +#include +#include #include #include -#include #include #include #include "paddle/infrt/dialect/dense_tensor.h" -namespace infrt::dialect { +namespace infrt { +namespace dialect { using namespace mlir; // NOLINT static ParseResult parseCallOp(OpAsmParser &parser, // NOLINT @@ -71,12 +71,12 @@ static ParseResult parseConstantF64Op(OpAsmParser &parser, // NOLINT static ParseResult parseConstantI32Op(OpAsmParser &parser, // NOLINT OperationState &result) { // NOLINT return parseConstantOp( - IntegerType::get(32, result.getContext()), parser, result); + IntegerType::get(result.getContext(), 32), parser, result); } static ParseResult parseConstantI64Op(OpAsmParser &parser, // NOLINT OperationState &result) { // NOLINT return parseConstantOp( - IntegerType::get(64, result.getContext()), parser, result); + IntegerType::get(result.getContext(), 64), parser, result); } static ParseResult parseReturnOp(OpAsmParser &parser, // NOLINT @@ -90,10 +90,10 @@ static ParseResult 
parseReturnOp(OpAsmParser &parser, // NOLINT } static void print(OpAsmPrinter &p, CallOp op) { // NOLINT - p << "infrt.call " << op.getAttr("callee") << "("; + p << "infrt.call " << op->getAttr("callee") << "("; p.printOperands(op.getOperands()); p << ")"; - p.printOptionalAttrDict(op.getAttrs(), {"callee"}); + p.printOptionalAttrDict(op->getAttrs(), {"callee"}); p << " : "; } @@ -145,7 +145,7 @@ static LogicalResult verify(ConstantF64Op op) { return success(); } static LogicalResult verify(ConstantI64Op op) { return success(); } static LogicalResult verify(ReturnOp op) { - auto function = dyn_cast(op.getParentOp()); + auto function = dyn_cast(op->getParentOp()); if (!function) return success(); @@ -157,8 +157,8 @@ static LogicalResult verify(ReturnOp op) { return success(); } +} // namespace dialect +} // namespace infrt #define GET_OP_CLASSES #include "paddle/infrt/dialect/basic_kernels.cpp.inc" - -} // namespace infrt::dialect diff --git a/paddle/infrt/dialect/basic_kernels.h b/paddle/infrt/dialect/basic_kernels.h index 65316bc1437c0..b82abcd52d28f 100644 --- a/paddle/infrt/dialect/basic_kernels.h +++ b/paddle/infrt/dialect/basic_kernels.h @@ -13,12 +13,9 @@ // limitations under the License. #pragma once +#include #include #include -using namespace mlir; // NOLINT - -namespace infrt::dialect { #define GET_OP_CLASSES #include "paddle/infrt/dialect/basic_kernels.hpp.inc" -} // namespace infrt::dialect diff --git a/paddle/infrt/dialect/basic_kernels.td b/paddle/infrt/dialect/basic_kernels.td index df5e4d8a2c6a1..7d8de79fbae2b 100644 --- a/paddle/infrt/dialect/basic_kernels.td +++ b/paddle/infrt/dialect/basic_kernels.td @@ -27,7 +27,7 @@ def CallOp : INFRT_Op<"call"> { let results = (outs Variadic); let extraClassDeclaration = [{ - StringRef getCallee() { return callee(); } + mlir::StringRef getCallee() { return callee(); } mlir::FunctionType getCalleeType(); }]; } @@ -57,9 +57,8 @@ def ReturnOp : INFRT_Op<"return", [Terminator]> { let arguments = (ins Variadic:$operands); - let builders = [OpBuilder< - "OpBuilder &b, OperationState &result", - [{ build(b, result, llvm::None); }]>]; + let builders = [OpBuilder<(ins), + [{ build($_builder, $_state, llvm::None); }]>]; } class AddOp : INFRT_Op<"add." 
# suffix, [NoSideEffect]> { diff --git a/paddle/infrt/dialect/dense_tensor.cc b/paddle/infrt/dialect/dense_tensor.cc index 629a7b16523fc..7685cdc65b9ad 100644 --- a/paddle/infrt/dialect/dense_tensor.cc +++ b/paddle/infrt/dialect/dense_tensor.cc @@ -17,12 +17,11 @@ #include #include #include +#include +#include #include -#include -#include #include #include -#include #include #include @@ -31,68 +30,37 @@ #include "paddle/infrt/common/global.h" #include "paddle/infrt/dialect/tensor_shape.h" -namespace infrt::dt { - +namespace infrt { +namespace dt { void DTDialect::initialize() { - allowUnknownTypes(); addOperations< #define GET_OP_LIST #include "paddle/infrt/dialect/dense_tensor.cpp.inc" >(); } -namespace detail { -struct TensorTypeStorage : public mlir::TypeStorage { - TensorTypeStorage(TargetType target, - LayoutType layout, - PrecisionType precision) - : target_(target), layout_(layout), precision_(precision) {} - - using KeyTy = std::tuple; - - bool operator==(const KeyTy &key) const { - return key == KeyTy(target_, layout_, precision_); - } - - static llvm::hash_code hashKey(const KeyTy &key) { - return llvm::hash_value(key); - } - - static TensorTypeStorage *construct( - mlir::TypeStorageAllocator &allocator, // NOLINT - const KeyTy &key) { - return new (allocator.allocate()) - TensorTypeStorage(std::get<0>(key), std::get<1>(key), std::get<2>(key)); - } - - TargetType target_; - LayoutType layout_; - PrecisionType precision_; -}; -} // namespace detail - llvm::Optional GetTargetType(mlir::StringRef key) { - if (key.equals_lower("x86")) + if (key.equals_insensitive("x86")) return TargetType::X86; - else if (key.equals_lower("cuda")) + else if (key.equals_insensitive("cuda")) return TargetType::CUDA; else return llvm::None; } llvm::Optional GetLayoutType(mlir::StringRef key) { - if (key.equals_lower("nchw")) + if (key.equals_insensitive("nchw")) return LayoutType::NCHW; - else if (key.equals_lower("nhwc")) + else if (key.equals_insensitive("nhwc")) return LayoutType::NHWC; else return llvm::None; } llvm::Optional GetPrecisionType(mlir::StringRef key) { - if (key.equals_lower("i32")) + if (key.equals_insensitive("i32")) return PrecisionType::I32; - else if (key.equals_lower("f32")) + else if (key.equals_insensitive("f32")) return PrecisionType::F32; else return llvm::None; @@ -111,7 +79,7 @@ LayoutType TensorType::layout() { return getImpl()->layout_; } PrecisionType TensorType::precision() { return getImpl()->precision_; } -raw_ostream &operator<<(raw_ostream &os, TensorType tensorType) { +mlir::raw_ostream &operator<<(mlir::raw_ostream &os, TensorType tensorType) { os << "TensorType<" << tensorType.target() << ", " << tensorType.layout() << ", " << tensorType.precision() << ">"; return os; @@ -133,7 +101,7 @@ StringType StringType::get(mlir::MLIRContext *context) { return Base::get(context); } -raw_ostream &operator<<(raw_ostream &os, TargetType type) { +mlir::raw_ostream &operator<<(mlir::raw_ostream &os, TargetType type) { switch (type) { case (TargetType::X86): os << "X86"; @@ -147,7 +115,7 @@ raw_ostream &operator<<(raw_ostream &os, TargetType type) { return os; } -raw_ostream &operator<<(raw_ostream &os, LayoutType type) { +mlir::raw_ostream &operator<<(mlir::raw_ostream &os, LayoutType type) { switch (type) { case (LayoutType::NCHW): os << "NCHW"; @@ -161,7 +129,7 @@ raw_ostream &operator<<(raw_ostream &os, LayoutType type) { return os; } -raw_ostream &operator<<(raw_ostream &os, PrecisionType type) { +mlir::raw_ostream &operator<<(mlir::raw_ostream &os, PrecisionType type) { 
switch (type) { case (PrecisionType::I32): os << "I32"; @@ -175,103 +143,69 @@ raw_ostream &operator<<(raw_ostream &os, PrecisionType type) { return os; } -static Type getTensorType(mlir::MLIRContext *context) { - auto t_dialect = Identifier::get("t", context); - return OpaqueType::get(t_dialect, "tensor", context); +static mlir::Type getTensorType(mlir::MLIRContext *context) { + auto t_dialect = mlir::Identifier::get("t", context); + return mlir::OpaqueType::get(t_dialect, "tensor"); } -static ParseResult parseCreateUninitTensorOp( - OpAsmParser &parser, // NOLINT - OperationState &result) { // NOLINT +static mlir::ParseResult parseCreateUninitTensorOp( + mlir::OpAsmParser &parser, // NOLINT + mlir::OperationState &result) { // NOLINT auto loc = parser.getCurrentLocation(); - ::mlir::Type outputRawTypes[1]; - ::llvm::ArrayRef<::mlir::Type> outputTypes(outputRawTypes); + mlir::Type outputRawTypes[1]; + ::llvm::ArrayRef outputTypes(outputRawTypes); mlir::ArrayAttr shapeAttr; if (parser.parseAttribute(shapeAttr, parser.getBuilder().getI64Type(), "shape", result.attributes)) - return failure(); - if (parser.parseOptionalAttrDict(result.attributes)) return failure(); + return mlir::failure(); + if (parser.parseOptionalAttrDict(result.attributes)) return mlir::failure(); - if (parser.parseArrow()) return failure(); - if (parser.parseType(outputRawTypes[0])) return failure(); + if (parser.parseArrow()) return mlir::failure(); + if (parser.parseType(outputRawTypes[0])) return mlir::failure(); if (!outputRawTypes[0].isa()) return parser.emitError(loc, "invalid kind of type specified"); result.addTypes(outputTypes); - return success(); + return mlir::success(); } template -static void printCreateUninitTensorOp(OpAsmPrinter &p, // NOLINT +static void printCreateUninitTensorOp(mlir::OpAsmPrinter &p, // NOLINT CreateUninitTensorOp op) { p << CreateUninitTensorOp::getOperationName(); p << " "; p.printAttributeWithoutType(op.shapeAttr()); - p.printOptionalAttrDict(op.getAttrs(), /*elidedAttrs=*/{"shape"}); + p.printOptionalAttrDict(op->getAttrs(), /*elidedAttrs=*/{"shape"}); p << " -> "; p << op.getOperation()->getResultTypes(); } -// TODO(shibo): can be removed? -// static ParseResult parseFillTensorWithConstantOp(OpAsmParser& parser, -// OperationState& result) { -// auto loc = parser.getCurrentLocation(); -// ::mlir::OpAsmParser::OperandType inputRawOperands[1]; -// ::llvm::ArrayRef<::mlir::OpAsmParser::OperandType> -// inputOperands(inputRawOperands); -// ::mlir::Type inputRawTypes[1]; -// ::llvm::ArrayRef<::mlir::Type> inputTypes(inputRawTypes); -// -// if (parser.parseOperand(inputRawOperands[0])) return failure(); -// -// if (parser.parseColon()) return failure(); -// if (parser.parseType(inputRawTypes[0])) return failure(); -// if (!inputRawTypes[0].isa()) -// return parser.emitError(loc, "invalid kind of type specified"); -// -// Attribute value_attr; -// if (parser.resolveOperands(inputOperands, inputTypes, loc, result.operands)) -// return failure(); -// if (parser.parseAttribute(value_attr, "value", result.attributes)) return -// failure(); -// return success(); -//} - -// TODO(shibo): can be removed? 
-// template -// static void printFillTensorWithConstantOp(OpAsmPrinter& p, FillTensorOp op) { -// p << FillTensorOp::getOperationName(); -// p << " "; -// p.printOperand(op.getOperand()); -// p << " : "; -// p << op.getOperation()->getOperandTypes(); -// p << " "; -// p << op.getAttr("value"); -//} - -static ParseResult parseSetTensorOp(OpAsmParser &parser, // NOLINT - OperationState &result) { // NOLINT - SmallVector operands; - if (parser.parseOperandList(operands, 1)) return failure(); +static mlir::ParseResult parseSetTensorOp( + mlir::OpAsmParser &parser, // NOLINT + mlir::OperationState &result) { // NOLINT + llvm::SmallVector operands; + if (parser.parseOperandList(operands, 1)) return mlir::failure(); auto tensor_type = getTensorType(result.getContext()); - Attribute value_attr; - return failure( + mlir::Attribute value_attr; + return mlir::failure( parser.resolveOperand(operands[0], tensor_type, result.operands) || parser.parseAttribute(value_attr, "values", result.attributes)); } template -static void printSetTensorOp(OpAsmPrinter &p, SetTensorOp op) { // NOLINT +static void printSetTensorOp(mlir::OpAsmPrinter &p, SetTensorOp op) { // NOLINT p << SetTensorOp::getOperationName() << " "; p.printOperand(op.getOperand()); - p << " " << op.getAttr("values"); + p << " " << op->getAttr("values"); } +} // namespace dt +} // namespace infrt #define GET_OP_CLASSES #include "paddle/infrt/dialect/dense_tensor.cpp.inc" // NOLINT -} // namespace infrt::dt +#include "paddle/infrt/dialect/dense_tensor_dialect.cpp.inc" diff --git a/paddle/infrt/dialect/dense_tensor.h b/paddle/infrt/dialect/dense_tensor.h index 866c62213ab05..416925d3382ba 100644 --- a/paddle/infrt/dialect/dense_tensor.h +++ b/paddle/infrt/dialect/dense_tensor.h @@ -19,13 +19,8 @@ #include -using namespace mlir; // NOLINT -namespace infrt::dt { - -namespace detail { -struct TensorTypeStorage; -} // namespace detail - +namespace infrt { +namespace dt { enum class TargetType : uint8_t { X86, CUDA }; enum class LayoutType : uint8_t { NCHW, NHWC }; enum class PrecisionType : uint8_t { I32, F32 }; @@ -34,9 +29,39 @@ llvm::Optional GetTargetType(mlir::StringRef key); llvm::Optional GetLayoutType(mlir::StringRef key); llvm::Optional GetPrecisionType(mlir::StringRef key); -raw_ostream &operator<<(raw_ostream &os, TargetType type); -raw_ostream &operator<<(raw_ostream &os, LayoutType type); -raw_ostream &operator<<(raw_ostream &os, PrecisionType type); +mlir::raw_ostream &operator<<(mlir::raw_ostream &os, TargetType type); +mlir::raw_ostream &operator<<(mlir::raw_ostream &os, LayoutType type); +mlir::raw_ostream &operator<<(mlir::raw_ostream &os, PrecisionType type); + +namespace detail { +struct TensorTypeStorage : public mlir::TypeStorage { + TensorTypeStorage(TargetType target, + LayoutType layout, + PrecisionType precision) + : target_(target), layout_(layout), precision_(precision) {} + + using KeyTy = std::tuple; + + bool operator==(const KeyTy &key) const { + return key == KeyTy(target_, layout_, precision_); + } + + static llvm::hash_code hashKey(const KeyTy &key) { + return llvm::hash_value(key); + } + + static TensorTypeStorage *construct( + mlir::TypeStorageAllocator &allocator, // NOLINT + const KeyTy &key) { + return new (allocator.allocate()) + TensorTypeStorage(std::get<0>(key), std::get<1>(key), std::get<2>(key)); + } + + TargetType target_; + LayoutType layout_; + PrecisionType precision_; +}; +} // namespace detail class TensorType : public mlir::Type::TypeBase #include -namespace infrt::dialect { +namespace infrt { 
+namespace dialect { struct MyScopedDiagnosicHandler::Impl { Impl() : diag_stream_(diag_str_) {} @@ -49,4 +51,5 @@ mlir::LogicalResult MyScopedDiagnosicHandler::handler(mlir::Diagnostic *diag) { return mlir::failure(true); } -} // namespace infrt::dialect +} // namespace dialect +} // namespace infrt diff --git a/paddle/infrt/dialect/diagnostic_utils.h b/paddle/infrt/dialect/diagnostic_utils.h index 3a8098cf75181..746e61c8fe5c3 100644 --- a/paddle/infrt/dialect/diagnostic_utils.h +++ b/paddle/infrt/dialect/diagnostic_utils.h @@ -18,7 +18,8 @@ #include -namespace infrt::dialect { +namespace infrt { +namespace dialect { /** * A scoped diagnostic handler to help debug MLIR process. @@ -36,4 +37,5 @@ class MyScopedDiagnosicHandler : public mlir::SourceMgrDiagnosticHandler { std::unique_ptr impl_; }; -} // namespace infrt::dialect +} // namespace dialect +} // namespace infrt diff --git a/paddle/infrt/dialect/dialect.cc b/paddle/infrt/dialect/dialect.cc index cbcd5d0f0fa78..fe07b91d22ed5 100644 --- a/paddle/infrt/dialect/dialect.cc +++ b/paddle/infrt/dialect/dialect.cc @@ -13,24 +13,26 @@ // limitations under the License. #include +#include #include -#include #include #include -#include #include #include -namespace infrt::hlir::dialect { +namespace infrt { +namespace hlir { +namespace dialect { -class CinnDialect : public ::mlir::Dialect { +class CinnDialect : public mlir::Dialect { public: - explicit CinnDialect(::mlir::MLIRContext* ctx); + explicit CinnDialect(mlir::MLIRContext* ctx); //! We should register this function in dialect static llvm::StringRef getDialectNamespace() { return "infrt::hlir::dialect"; } }; - -} // namespace infrt::hlir::dialect +} // namespace dialect +} // namespace hlir +} // namespace infrt diff --git a/paddle/infrt/dialect/infrt_base.cc b/paddle/infrt/dialect/infrt_base.cc index b28ad5ad4b5a5..e8005661bbd65 100644 --- a/paddle/infrt/dialect/infrt_base.cc +++ b/paddle/infrt/dialect/infrt_base.cc @@ -18,7 +18,8 @@ #include "paddle/infrt/dialect/dense_tensor.h" #include "paddle/infrt/dialect/test_kernels.h" -namespace infrt::dialect { +namespace infrt { +namespace dialect { // ----INFRTDialect definition begin---- void INFRTDialect::initialize() { @@ -124,4 +125,5 @@ void INFRTDialect::printType(mlir::Type type, // ----INFRTDialect definition end---- -} // namespace infrt::dialect +} // namespace dialect +} // namespace infrt diff --git a/paddle/infrt/dialect/infrt_base.h b/paddle/infrt/dialect/infrt_base.h index 58acd7c9a409a..1a7fbcf395a6e 100644 --- a/paddle/infrt/dialect/infrt_base.h +++ b/paddle/infrt/dialect/infrt_base.h @@ -18,19 +18,17 @@ #include #include #include -#include #include #include #include "paddle/infrt/dialect/infrt_base.hpp.inc" -namespace infrt::dialect { - -class INFRTDialect : public ::mlir::Dialect { - explicit INFRTDialect(::mlir::MLIRContext *context) - : ::mlir::Dialect(getDialectNamespace(), - context, - ::mlir::TypeID::get()) { +namespace infrt { +namespace dialect { +class INFRTDialect : public mlir::Dialect { + explicit INFRTDialect(mlir::MLIRContext *context) + : mlir::Dialect( + getDialectNamespace(), context, mlir::TypeID::get()) { initialize(); } @@ -41,15 +39,12 @@ class INFRTDialect : public ::mlir::Dialect { mlir::DialectAsmPrinter &printer) const override; void initialize(); - friend class ::mlir::MLIRContext; + friend class mlir::MLIRContext; public: static ::llvm::StringRef getDialectNamespace() { return "infrt"; } }; - -} // namespace infrt::dialect - -namespace mlir { +} // namespace dialect template static 
mlir::IntegerAttr createI32Attr(mlir::OpBuilder &b, // NOLINT @@ -58,17 +53,16 @@ static mlir::IntegerAttr createI32Attr(mlir::OpBuilder &b, // NOLINT return b.getIntegerAttr(b.getI32Type(), constant); } -static mlir::SmallVector<::mlir::Value, 4> cvtValueToValueRange( +static mlir::SmallVector cvtValueToValueRange( const mlir::Value &operand) { - return mlir::SmallVector<::mlir::Value, 4>(1, operand); + return mlir::SmallVector(1, operand); } -static mlir::SmallVector<::mlir::Value, 4> concatTwoValueRange( +static mlir::SmallVector concatTwoValueRange( mlir::ValueRange operand_0, mlir::ValueRange operand_1) { - mlir::SmallVector<::mlir::Value, 4> operands; + mlir::SmallVector operands; operands.append(operand_0.begin(), operand_0.end()); operands.append(operand_1.begin(), operand_1.end()); return operands; } - -} // namespace mlir +} // namespace infrt diff --git a/paddle/infrt/dialect/infrt_base.td b/paddle/infrt/dialect/infrt_base.td index 7d6fdbbbf2f68..1abd294236d93 100644 --- a/paddle/infrt/dialect/infrt_base.td +++ b/paddle/infrt/dialect/infrt_base.td @@ -28,11 +28,11 @@ def TensorMapType : def BufferType : OpaqueType<"b", "buffer", "buffer">; class INFRT_createI32Attr : NativeCodeCall< - "mlir::createI32Attr($_builder, $_loc, " # value # ")">; + "infrt::createI32Attr($_builder, $_loc, " # value # ")">; def INFRT_cvtValueToValueRange : NativeCodeCall< - "mlir::cvtValueToValueRange($0)">; + "infrt::cvtValueToValueRange($0)">; def INFRT_concatTwoValueRange : NativeCodeCall< - "mlir::concatTwoValueRange($0, $1)">; + "infrt::concatTwoValueRange($0, $1)">; #endif // INFRT_BASE diff --git a/paddle/infrt/dialect/init_infrt_dialects.cc b/paddle/infrt/dialect/init_infrt_dialects.cc index 4bc2bf70942d2..c3769414dbb39 100644 --- a/paddle/infrt/dialect/init_infrt_dialects.cc +++ b/paddle/infrt/dialect/init_infrt_dialects.cc @@ -23,12 +23,10 @@ #include "paddle/infrt/dialect/tensor_shape.h" namespace infrt { - -void RegisterCinnDialects(mlir::DialectRegistry& registry) { // NOLINT - registry.insert(); - registry.insert(); - registry.insert(); - registry.insert(); +void registerCinnDialects(mlir::DialectRegistry ®istry) { // NOLINT + registry.insert(); } - } // namespace infrt diff --git a/paddle/infrt/dialect/init_infrt_dialects.h b/paddle/infrt/dialect/init_infrt_dialects.h index 50caca018980d..0912e9ef2555b 100644 --- a/paddle/infrt/dialect/init_infrt_dialects.h +++ b/paddle/infrt/dialect/init_infrt_dialects.h @@ -14,10 +14,8 @@ #pragma once -#include "mlir/IR/Dialect.h" - +#include +#include namespace infrt { - -void RegisterCinnDialects(mlir::DialectRegistry& registry); // NOLINT - +void registerCinnDialects(mlir::DialectRegistry ®istry); // NOLINT } // namespace infrt diff --git a/paddle/infrt/dialect/mlir_loader.cc b/paddle/infrt/dialect/mlir_loader.cc index b318a6a763483..1d0696e77dcda 100644 --- a/paddle/infrt/dialect/mlir_loader.cc +++ b/paddle/infrt/dialect/mlir_loader.cc @@ -16,8 +16,8 @@ #include #include +#include #include -#include #include #include #include @@ -30,12 +30,15 @@ #include "paddle/infrt/dialect/diagnostic_utils.h" #include "paddle/infrt/dialect/init_infrt_dialects.h" -namespace infrt::dialect { +namespace infrt { +namespace dialect { mlir::OwningModuleRef LoadMlirSource(mlir::MLIRContext* context, const std::string& mlir_source) { // context->allowUnregisteredDialects(); - RegisterCinnDialects(context->getDialectRegistry()); + mlir::DialectRegistry registry; + registerCinnDialects(registry); + context->appendDialectRegistry(registry); // Currenetly, We only used the 
CinnDialect and mlir::BuiltinDialect is // enough。Don't need StandardOpsDialect. // context->getDialectRegistry().insert(); @@ -57,9 +60,9 @@ mlir::OwningModuleRef LoadMlirSource(mlir::MLIRContext* context, mlir::OwningModuleRef LoadMlirFile(const std::string& file_name, mlir::MLIRContext* context) { // context->allowUnregisteredDialects(); - RegisterCinnDialects(context->getDialectRegistry()); - context->getDialectRegistry().insert(); - + mlir::DialectRegistry registry; + registerCinnDialects(registry); + context->appendDialectRegistry(registry); mlir::ScopedDiagnosticHandler scope_handler( context, [](mlir::Diagnostic& diag) { if (diag.getSeverity() != mlir::DiagnosticSeverity::Error) @@ -71,4 +74,5 @@ mlir::OwningModuleRef LoadMlirFile(const std::string& file_name, return mlir::parseSourceFile(std::string(file_name), context); } -} // namespace infrt::dialect +} // namespace dialect +} // namespace infrt diff --git a/paddle/infrt/dialect/mlir_loader.h b/paddle/infrt/dialect/mlir_loader.h index 092da7d9ce03f..5e50ad9e5a271 100644 --- a/paddle/infrt/dialect/mlir_loader.h +++ b/paddle/infrt/dialect/mlir_loader.h @@ -15,16 +15,17 @@ #pragma once #include -#include +#include #include #include -namespace infrt::dialect { +namespace infrt { +namespace dialect { mlir::OwningModuleRef LoadMlirSource(mlir::MLIRContext* context, const std::string& mlir_source); mlir::OwningModuleRef LoadMlirFile(const std::string& file_name, mlir::MLIRContext* context); - -} // namespace infrt::dialect +} // namespace dialect +} // namespace infrt diff --git a/paddle/infrt/dialect/mlir_loader_test.cc b/paddle/infrt/dialect/mlir_loader_test.cc index 1b622d585ad8e..1115053073044 100644 --- a/paddle/infrt/dialect/mlir_loader_test.cc +++ b/paddle/infrt/dialect/mlir_loader_test.cc @@ -17,14 +17,15 @@ #include #include #include -#include +#include #include #include #include "paddle/infrt/dialect/init_infrt_dialects.h" -namespace infrt::dialect { +namespace infrt { +namespace dialect { TEST(MlirLoader, basic) { mlir::MLIRContext context; @@ -42,8 +43,7 @@ func @main() -> f32 { )ROC"; auto module = LoadMlirSource(&context, source); - module->verify(); - + EXPECT_TRUE(mlir::succeeded(module->verify())); LOG(INFO) << "module name: " << module->getOperationName().data(); for (auto func : module->getOps()) { LOG(INFO) << "get func " << func.getName().str(); @@ -54,4 +54,5 @@ func @main() -> f32 { } } -} // namespace infrt::dialect +} // namespace dialect +} // namespace infrt diff --git a/paddle/infrt/dialect/mlir_tests/rewrite.mlir b/paddle/infrt/dialect/mlir_tests/rewrite.mlir index bfad9d1f6924d..5e207634da8e4 100644 --- a/paddle/infrt/dialect/mlir_tests/rewrite.mlir +++ b/paddle/infrt/dialect/mlir_tests/rewrite.mlir @@ -20,5 +20,5 @@ func @main() -> tensor { %c2 = "pd.matmul"(%e1, %b2) {transpose_x=true, transpose_y=false} : (tensor, tensor) -> tensor %d2 = "pd.elementwise_add"(%c2, %bias2) {axis=1:i32} : (tensor, tensor) -> tensor %e2 = "pd.relu"(%d2) {} : (tensor) -> tensor - infrt.return %e2 : tensor + "pd.fetch"(%e2) {name="output"} :(tensor)->() } \ No newline at end of file diff --git a/paddle/infrt/dialect/mlir_tests/rewrite_conv_bn.mlir b/paddle/infrt/dialect/mlir_tests/rewrite_conv_bn.mlir index 9ea1ec0ebca36..2889b92b18ef0 100644 --- a/paddle/infrt/dialect/mlir_tests/rewrite_conv_bn.mlir +++ b/paddle/infrt/dialect/mlir_tests/rewrite_conv_bn.mlir @@ -11,5 +11,5 @@ func @main() -> tensor { %c = "pd.conv2d"(%a, %filter, %bias) {} : (tensor, tensor<3x64x3x3xf32>, tensor<64xf32>) -> tensor %d = "pd.batch_norm"(%c, 
%scale, %bias2, %mean, %var) {} : (tensor, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>) -> tensor - infrt.return %d : tensor + "pd.fetch"(%d) {name="output"} :(tensor)->() } \ No newline at end of file diff --git a/paddle/infrt/dialect/mlir_tests/trt_ops.mlir b/paddle/infrt/dialect/mlir_tests/trt_ops.mlir index 009b6d1c19653..d98f107bab41e 100644 --- a/paddle/infrt/dialect/mlir_tests/trt_ops.mlir +++ b/paddle/infrt/dialect/mlir_tests/trt_ops.mlir @@ -18,5 +18,5 @@ func @main() -> tensor { %d2 = "pd.elementwise_add"(%c2, %bias2) {axis=1:i32} : (tensor, tensor) -> tensor %e2 = "pd.relu"(%d2) {} : (tensor) -> tensor - "pd.fetch"(%e2) :(tensor)->() + "pd.fetch"(%e2) {name="output"} :(tensor)->() } diff --git a/paddle/infrt/dialect/ops.td b/paddle/infrt/dialect/ops.td deleted file mode 100644 index 264134a447c63..0000000000000 --- a/paddle/infrt/dialect/ops.td +++ /dev/null @@ -1,6 +0,0 @@ -include "mlir/IR/OpBase.td" -include "paddle/infrt/dialect/infrt_base.td" - - -class INFRT_Op traits = []> : - Op; diff --git a/paddle/infrt/dialect/opt.cc b/paddle/infrt/dialect/opt.cc index d90d25230d0c2..5bcf5a23f4c53 100644 --- a/paddle/infrt/dialect/opt.cc +++ b/paddle/infrt/dialect/opt.cc @@ -12,34 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include #include #include - -#include - -#include "paddle/infrt/common/global.h" #include "paddle/infrt/dialect/init_infrt_dialects.h" -#include "paddle/infrt/dialect/mlir_loader.h" int main(int argc, char **argv) { - mlir::MLIRContext *context = infrt::Global::getMLIRContext(); - - auto ®istry = context->getDialectRegistry(); - infrt::RegisterCinnDialects(registry); - + mlir::DialectRegistry registry; + infrt::registerCinnDialects(registry); mlir::registerCanonicalizerPass(); - return mlir::failed( - mlir::MlirOptMain(argc, argv, "INFRT mlir pass driver", registry)); + mlir::MlirOptMain(argc, argv, "infrt mlir pass driver", registry)); } diff --git a/paddle/infrt/dialect/pd_op_base.td b/paddle/infrt/dialect/pd_op_base.td index af53df113dfb3..a3e3c4ae59277 100644 --- a/paddle/infrt/dialect/pd_op_base.td +++ b/paddle/infrt/dialect/pd_op_base.td @@ -16,7 +16,7 @@ def PD_Dialect : Dialect { This dialect contains the PaddlePaddle operators. 
}]; - let cppNamespace = "::mlir::pd"; + let cppNamespace = "mlir::pd"; } class PD_Op traits = []> : diff --git a/paddle/infrt/dialect/pd_ops.cc b/paddle/infrt/dialect/pd_ops.cc index ce10be6d100f8..fe38996883846 100644 --- a/paddle/infrt/dialect/pd_ops.cc +++ b/paddle/infrt/dialect/pd_ops.cc @@ -14,10 +14,15 @@ #include "paddle/infrt/dialect/pd_ops.h" -#include "mlir/IR/Matchers.h" -#include "mlir/IR/PatternMatch.h" +#include +#include #include "paddle/infrt/dialect/infrt_base.h" +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/pd_ops.cpp.inc" // NOLINT + +#include "paddle/infrt/dialect/rewrite.hpp.inc" // NOLINT + namespace mlir { namespace pd { PaddleDialect::PaddleDialect(MLIRContext *context) @@ -36,12 +41,6 @@ mlir::Operation *PaddleDialect::materializeConstant(mlir::OpBuilder &builder, return builder.create(loc, value); } -#define GET_OP_CLASSES -#include "paddle/infrt/dialect/pd_ops.cpp.inc" // NOLINT -#undef GET_OP_CLASSES - -#include "paddle/infrt/dialect/rewrite.hpp.inc" // NOLINT - void ConstantOp::build(OpBuilder &builder, OperationState &state, Attribute value) { @@ -66,8 +65,8 @@ LogicalResult ConstantOp::inferReturnTypes( inferredReturnTypes.push_back(attributes.get("value").getType()); return success(); } -::mlir::OpFoldResult ConstantOp::fold( - ::llvm::ArrayRef<::mlir::Attribute> operands) { +mlir::OpFoldResult ConstantOp::fold( + ::llvm::ArrayRef operands) { return value(); } @@ -82,11 +81,11 @@ LogicalResult ElementwiseAdd::inferReturnTypes( return success(); } void ElementwiseAdd::getCanonicalizationPatterns( - ::mlir::OwningRewritePatternList &results, ::mlir::MLIRContext *context) { + mlir::OwningRewritePatternList &results, mlir::MLIRContext *context) { results.insert(context); } -::mlir::OpFoldResult ElementwiseAdd::fold( +mlir::OpFoldResult ElementwiseAdd::fold( llvm::ArrayRef operands) { if (getElementTypeOrSelf(getType()).isa()) { if (!operands[0] || !operands[1]) return {}; @@ -154,17 +153,17 @@ LogicalResult MulOp::inferReturnTypes( } void ReluOp::getCanonicalizationPatterns( - ::mlir::OwningRewritePatternList &results, ::mlir::MLIRContext *context) { + mlir::OwningRewritePatternList &results, mlir::MLIRContext *context) { results.insert(context); } void FusedRepeatedFCRelu::getCanonicalizationPatterns( - ::mlir::OwningRewritePatternList &results, ::mlir::MLIRContext *context) { + mlir::OwningRewritePatternList &results, mlir::MLIRContext *context) { results.insert(context); } void BatchNormOp::getCanonicalizationPatterns( - ::mlir::OwningRewritePatternList &results, ::mlir::MLIRContext *context) { + mlir::OwningRewritePatternList &results, mlir::MLIRContext *context) { results.insert(context); } diff --git a/paddle/infrt/dialect/pd_ops.h b/paddle/infrt/dialect/pd_ops.h index 71e0a53988d1a..7d1d1d6f58451 100644 --- a/paddle/infrt/dialect/pd_ops.h +++ b/paddle/infrt/dialect/pd_ops.h @@ -14,21 +14,20 @@ #pragma once -#include "mlir/Dialect/Traits.h" -#include "mlir/IR/Attributes.h" -#include "mlir/IR/Builders.h" -#include "mlir/IR/Dialect.h" -#include "mlir/IR/Function.h" -#include "mlir/IR/Matchers.h" -#include "mlir/IR/Module.h" -#include "mlir/IR/OpImplementation.h" -#include "mlir/IR/StandardTypes.h" -#include "mlir/IR/TypeUtilities.h" -#include "mlir/Interfaces/CallInterfaces.h" -#include "mlir/Interfaces/DerivedAttributeOpInterface.h" -#include "mlir/Interfaces/InferTypeOpInterface.h" -#include "mlir/Interfaces/LoopLikeInterface.h" -#include "mlir/Interfaces/SideEffectInterfaces.h" +#include +#include +#include +#include +#include +#include 
+#include +#include +#include +#include +#include +#include +#include +#include namespace mlir { namespace pd { @@ -53,9 +52,8 @@ class PaddleDialect : public Dialect { } }; -#define GET_OP_CLASSES -#include "paddle/infrt/dialect/pd_ops.hpp.inc" -#undef GET_OP_CLASSES - } // namespace pd } // namespace mlir + +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/pd_ops.hpp.inc" diff --git a/paddle/infrt/dialect/pd_ops.td b/paddle/infrt/dialect/pd_ops.td index b020b7ad5dbc7..3addf15082a12 100644 --- a/paddle/infrt/dialect/pd_ops.td +++ b/paddle/infrt/dialect/pd_ops.td @@ -24,6 +24,16 @@ def PD_FeedOp : PD_Op<"feed"> { def PD_FetchOp : PD_Op<"fetch", [Terminator]> { let summary = "fetch Op"; + let description = [{ + Return the output tensor from the subgraph. + }]; + + let arguments = (ins PD_Tensor :$inputs, StrAttr:$name); +} + +def PD_ReturnOp : PD_Op<"return", [Terminator]> { + let summary = "return Op"; + let description = [{ Fetch tensor from the graph. }]; @@ -31,7 +41,7 @@ def PD_FetchOp : PD_Op<"fetch", [Terminator]> { let arguments = (ins Variadic:$inputs); } -def PD_GraphOp : PD_Op<"graph", [SingleBlockImplicitTerminator<"FetchOp">]> { +def PD_GraphOp : PD_Op<"graph", [SingleBlockImplicitTerminator<"ReturnOp">]> { let summary = "paddle graph Op"; let description = [{ Describe a paddle graph or subgraph. @@ -50,7 +60,7 @@ def PD_ConstantOp : PD_Op<"constant", [NoSideEffect, ConstantLike, DeclareOpInte let hasFolder = 1; let builders = [ - OpBuilder<"OpBuilder &builder, OperationState &state, Attribute value">, + OpBuilder<(ins "Attribute":$value)>, ]; } diff --git a/paddle/infrt/dialect/pd_types.h b/paddle/infrt/dialect/pd_types.h index 6f9fe56338a9f..0da888a9c0769 100644 --- a/paddle/infrt/dialect/pd_types.h +++ b/paddle/infrt/dialect/pd_types.h @@ -18,12 +18,11 @@ #pragma once -#include "mlir/IR/Diagnostics.h" -#include "mlir/IR/Location.h" -#include "mlir/IR/Operation.h" -#include "mlir/IR/StandardTypes.h" -#include "mlir/IR/TypeUtilities.h" -#include "mlir/IR/Types.h" +#include +#include +#include +#include +#include namespace mlir { namespace PD { diff --git a/paddle/infrt/dialect/print_ir.cc b/paddle/infrt/dialect/print_ir.cc index 43a3577b90f10..5cfd16ee85943 100644 --- a/paddle/infrt/dialect/print_ir.cc +++ b/paddle/infrt/dialect/print_ir.cc @@ -11,26 +11,25 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
- +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include -#include "llvm/ADT/Optional.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/ScopedPrinter.h" -#include "llvm/Support/raw_os_ostream.h" -#include "llvm/Support/raw_ostream.h" -#include "mlir/Dialect/StandardOps/IR/Ops.h" -#include "mlir/IR/AsmState.h" -#include "mlir/IR/Block.h" -#include "mlir/IR/MLIRContext.h" -#include "mlir/IR/Module.h" -#include "mlir/IR/Operation.h" -#include "mlir/IR/Region.h" -#include "mlir/IR/Verifier.h" -#include "mlir/Parser.h" -#include "mlir/Pass/PassManager.h" -#include "mlir/Support/LogicalResult.h" -#include "mlir/Transforms/Passes.h" #include "paddle/infrt/common/global.h" #include "paddle/infrt/dialect/init_infrt_dialects.h" @@ -114,17 +113,15 @@ int main(int argc, char **argv) { mlir::registerPassManagerCLOptions(); cl::ParseCommandLineOptions(argc, argv, "mlir demo"); - mlir::MLIRContext *context = infrt::Global::getMLIRContext(); - // context->allowUnregisteredDialects(); - auto ®istry = context->getDialectRegistry(); - infrt::RegisterCinnDialects(registry); - + mlir::DialectRegistry registry; + infrt::registerCinnDialects(registry); + mlir::MLIRContext context(registry); // mlir will verify module automatically after parsing. // https://github.com/llvm/llvm-project/blob/38d18d93534d290d045bbbfa86337e70f1139dc2/mlir/lib/Parser/Parser.cpp#L2051 // mlir::OwningModuleRef module_ref = mlir::parseSourceString(mlir_source, // context); mlir::OwningModuleRef module_ref = - mlir::parseSourceFile(inputFilename, context); + mlir::parseSourceFile(inputFilename, &context); std::cout << "----------print IR Structure begin----------" << std::endl; printOperation(module_ref->getOperation(), 0); std::cout << "----------print IR Structure end----------" << std::endl; diff --git a/paddle/infrt/dialect/tensor_shape.cc b/paddle/infrt/dialect/tensor_shape.cc index ef5a5525cb22f..92c03818264ee 100644 --- a/paddle/infrt/dialect/tensor_shape.cc +++ b/paddle/infrt/dialect/tensor_shape.cc @@ -17,16 +17,16 @@ #include #include #include +#include +#include #include -#include -#include #include #include -#include #include #include -namespace infrt::ts { +namespace infrt { +namespace ts { using namespace mlir; // NOLINT void TensorShapeDialect::initialize() { @@ -48,8 +48,8 @@ Type TensorShapeDialect::parseType(DialectAsmParser &parser) const { return Type(); } -void TensorShapeDialect::printType(::mlir::Type type, - ::mlir::DialectAsmPrinter &os) const { +void TensorShapeDialect::printType(mlir::Type type, + mlir::DialectAsmPrinter &os) const { if (type.isa()) { os << "shape"; return; @@ -61,8 +61,10 @@ void TensorShapeDialect::printType(::mlir::Type type, } llvm_unreachable("unexpected 'shape' type kind"); } +} // namespace ts +} // namespace infrt #define GET_OP_CLASSES #include "paddle/infrt/dialect/tensor_shape.cpp.inc" // NOLINT -} // namespace infrt::ts +#include "paddle/infrt/dialect/tensor_shape_dialect.cpp.inc" diff --git a/paddle/infrt/dialect/tensor_shape.h b/paddle/infrt/dialect/tensor_shape.h index bd3fa8853675a..af892af735d2a 100644 --- a/paddle/infrt/dialect/tensor_shape.h +++ b/paddle/infrt/dialect/tensor_shape.h @@ -17,7 +17,8 @@ #include #include -namespace infrt::ts { +namespace infrt { +namespace ts { class ShapeType : public mlir::Type::TypeBase { @@ -31,10 +32,9 @@ class PartialShapeType : public mlir::Type::TypeBase()">, "!ts.shape type">, 
BuildableType<"$_builder.getType<::infrt::ts::ShapeType>()"> { - let typeDescription = [{ + let description = [{ `!ts.shape type` represents a static tensor shape. }]; } @@ -27,7 +27,7 @@ BuildableType<"$_builder.getType<::infrt::ts::ShapeType>()"> { def TS_PartialShape : DialectType()">, "!ts.partial_shape type">, BuildableType<"$_builder.getType<::infrt::ts::PartialShapeType>()"> { - let typeDescription = [{ + let description = [{ `!ts.partial_shape type` represents either a static tensor shape, unranked tensor shape or a ranked tensor shape with unknown dimension sizes. }]; diff --git a/paddle/infrt/dialect/tensorrt/trt_exec.cc b/paddle/infrt/dialect/tensorrt/trt_exec.cc index dc0f2acb2b733..1baef7a3f77fd 100644 --- a/paddle/infrt/dialect/tensorrt/trt_exec.cc +++ b/paddle/infrt/dialect/tensorrt/trt_exec.cc @@ -11,10 +11,10 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. +#include +#include #include #include -#include "llvm/Support/CommandLine.h" -#include "mlir/Pass/PassManager.h" #include "paddle/infrt/common/global.h" #include "paddle/infrt/dialect/mlir_loader.h" #include "paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h" diff --git a/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc b/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc index 181f462962aee..1da80ef2c3b10 100644 --- a/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc +++ b/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc @@ -14,14 +14,13 @@ #include "paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h" +#include +#include +#include +#include #include #include #include -#include "llvm/ADT/SetVector.h" -#include "mlir/Analysis/SliceAnalysis.h" -#include "mlir/IR/Builders.h" -#include "paddle/infrt/dialect/pd_ops.h" -#include "paddle/infrt/dialect/tensorrt/trt_ops.h" namespace infrt { namespace trt { @@ -32,9 +31,9 @@ namespace { // Reference the function nameed "FlexibleDFS" but defined in: // paddle/fluid/framework/ir/subgraph_detector.cc. -bool reverseDfs(std::vector<::mlir::Operation *> source, - const std::function &func) { - std::unordered_set visited; +bool reverseDfs(std::vector source, + const std::function &func) { + std::unordered_set visited; while (!source.empty()) { auto node = source.back(); source.pop_back(); @@ -44,7 +43,7 @@ bool reverseDfs(std::vector<::mlir::Operation *> source, auto values = node->getOperands(); for (auto value : values) { // if the value is a block argument, the node is nullptr. - ::mlir::Operation *node = value.getDefiningOp(); + mlir::Operation *node = value.getDefiningOp(); if (node != nullptr && !visited.count(node)) { source.emplace_back(node); } @@ -54,19 +53,19 @@ bool reverseDfs(std::vector<::mlir::Operation *> source, } // merge the first&second graph op to a new graph op. 
-void mergeTwoAdjacentGraphOp(::mlir::OpBuilder &builder, // NOLINT - ::mlir::pd::GraphOp first, - ::mlir::pd::GraphOp second) { +void mergeTwoAdjacentGraphOp(mlir::OpBuilder &builder, // NOLINT + mlir::pd::GraphOp first, + mlir::pd::GraphOp second) { // comput inputs and outputs - ::llvm::SmallVector<::mlir::Value, 4> inputs(first.getOperands()), outputs; - for (::mlir::Value input : second.getOperands()) { + ::llvm::SmallVector inputs(first.getOperands()), outputs; + for (mlir::Value input : second.getOperands()) { if (input.getDefiningOp() != first) { inputs.push_back(input); } } - ::llvm::DenseMap<::mlir::Value, unsigned int> op_output_mapping; - for (::mlir::Value output : first.getResults()) { - for (::mlir::Operation *user : output.getUsers()) { + ::llvm::DenseMap op_output_mapping; + for (mlir::Value output : first.getResults()) { + for (mlir::Operation *user : output.getUsers()) { if (user != second && user->getParentOp() != second) { op_output_mapping[output] = outputs.size(); outputs.push_back(output); @@ -74,19 +73,19 @@ void mergeTwoAdjacentGraphOp(::mlir::OpBuilder &builder, // NOLINT } } } - auto fetch_op = second.getBody()->getTerminator(); - outputs.append(fetch_op->getOperands().begin(), - fetch_op->getOperands().end()); - ::llvm::SmallVector<::mlir::Type, 4> fetch_types; + auto return_op = second.getBody()->getTerminator(); + outputs.append(return_op->getOperands().begin(), + return_op->getOperands().end()); + ::llvm::SmallVector return_types; for (auto value : outputs) { - fetch_types.push_back(value.getType()); + return_types.push_back(value.getType()); } // create the new graph op builder.setInsertionPoint(first); auto loc = first.getLoc(); - auto graph_op = builder.create<::mlir::pd::GraphOp>(loc, fetch_types, inputs); - ::mlir::Block *block = new ::mlir::Block; + auto graph_op = builder.create(loc, return_types, inputs); + mlir::Block *block = new mlir::Block; auto copy_range = second.getBody()->without_terminator(); block->getOperations().splice(block->begin(), second.getBody()->getOperations(), @@ -98,18 +97,18 @@ void mergeTwoAdjacentGraphOp(::mlir::OpBuilder &builder, // NOLINT copy_range.begin(), copy_range.end()); builder.setInsertionPointToEnd(block); - builder.create(loc, outputs); + builder.create(loc, outputs); graph_op.body().push_back(block); // mapping the output unsigned int num_result = first.getNumResults(); - fetch_op = first.getBody()->getTerminator(); + return_op = first.getBody()->getTerminator(); for (unsigned int index = 0; index < num_result; ++index) { auto origin_value = first.getResult(index); if (op_output_mapping.find(origin_value) == op_output_mapping.end()) { - origin_value.replaceAllUsesWith(fetch_op->getOperand(index)); + origin_value.replaceAllUsesWith(return_op->getOperand(index)); } else { - auto inner_value = fetch_op->getOperand(index); + auto inner_value = return_op->getOperand(index); auto outer_value = graph_op.getResult(op_output_mapping[origin_value]); while (!origin_value.use_empty()) { auto replace_value = @@ -128,13 +127,13 @@ void mergeTwoAdjacentGraphOp(::mlir::OpBuilder &builder, // NOLINT // Topological sort the function op. 
void topoSortBlock(mlir::Block &body) { // NOLINT - llvm::SetVector toSort; + llvm::SetVector toSort; if (body.empty()) return; for (auto it = body.rbegin(); it != body.rend(); ++it) { toSort.insert(&*it); } - llvm::SetVector result = - ::mlir::topologicalSort(std::move(toSort)); + llvm::SetVector result = + mlir::topologicalSort(std::move(toSort)); for (auto *op : result) { op->moveBefore(body.getTerminator()); } @@ -145,21 +144,21 @@ void topoSortBlock(mlir::Block &body) { // NOLINT // Implementation of the trtGraphFusePass. void trtGraphFusePass::runOnFunction() { mlir::Block &body = getFunction().front(); - ::mlir::OpBuilder builder(&body, body.begin()); + mlir::OpBuilder builder(&body, body.begin()); bool changed = false; do { changed = false; for (auto &op : body) { - ::mlir::pd::GraphOp graph_op = - ::llvm::dyn_cast_or_null<::mlir::pd::GraphOp>(&op); + mlir::pd::GraphOp graph_op = + ::llvm::dyn_cast_or_null(&op); if (nullptr == graph_op) continue; for (auto user_op : op.getUsers()) { - ::mlir::pd::GraphOp user_graph_op = - ::llvm::dyn_cast_or_null<::mlir::pd::GraphOp>(user_op); + mlir::pd::GraphOp user_graph_op = + ::llvm::dyn_cast_or_null(user_op); if (nullptr == user_graph_op) continue; // get all dst input nodes except src. - std::vector<::mlir::Operation *> source_nodes; + std::vector source_nodes; for (auto operand : user_op->getOperands()) { auto input = operand.getDefiningOp(); if (input != &op && input != nullptr) { @@ -167,9 +166,8 @@ void trtGraphFusePass::runOnFunction() { } } // Reverse DFS from the source_nodes. - if (!reverseDfs(source_nodes, [&op](const ::mlir::Operation *n) { - return n == &op; - })) { + if (!reverseDfs(source_nodes, + [&op](const mlir::Operation *n) { return n == &op; })) { mergeTwoAdjacentGraphOp(builder, graph_op, user_graph_op); changed = true; break; diff --git a/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h b/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h index e7134e88f316c..f1e555c6f67ec 100644 --- a/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h +++ b/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h @@ -13,7 +13,7 @@ // limitations under the License. #pragma once -#include "mlir/Pass/Pass.h" +#include namespace infrt { namespace trt { @@ -28,15 +28,15 @@ namespace trt { * %a = "pd.feed"()... * %c = "pd.graph"(%a) { * %m = "pd.conv2d"(%a)... - * "pd.fetch" %m + * "pd.return" %m * } ... * %d = "pd.graph"(%c) { * %m = "pd.conv3d"(%c)... - * "pd.fetch" %m + * "pd.return" %m * } ... * %f = "pd.graph"(%a) { * %m = "pd.conv2d"(%a)... - * "pd.fetch" %m + * "pd.return" %m * } ... * "pd.fetch" %d, %f * @@ -47,13 +47,13 @@ namespace trt { * %m = "pd.conv2d"(%a)... * %n = "pd.conv3d"(%m)... * %s = "pd.conv2d"(%a)... - * "pd.fetch" %n, %s + * "pd.return" %n, %s * } ... 
* "pd.fetch" %d, %f * } */ class trtGraphFusePass - : public ::mlir::PassWrapper { + : public mlir::PassWrapper { public: ::llvm::StringRef getName() const override { return "trtGraphFusePass"; } void runOnFunction() override; diff --git a/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc b/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc index 2b45364de2036..257f2b5285425 100644 --- a/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc +++ b/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc @@ -14,7 +14,7 @@ #include "paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h" -#include "mlir/IR/Builders.h" +#include #include "paddle/infrt/dialect/pd_ops.h" #include "paddle/infrt/dialect/tensorrt/trt_ops.h" @@ -22,24 +22,24 @@ namespace infrt { namespace trt { // Implementation of the trtGraphSplitPass。 void trtGraphSplitPass::runOnFunction() { - std::vector<::mlir::pd::GraphOp> worklist; - ::mlir::Block& block = getFunction().front(); + std::vector worklist; + mlir::Block& block = getFunction().front(); for (auto& op : block) { - ::mlir::pd::GraphOp graph_op = - ::llvm::dyn_cast_or_null<::mlir::pd::GraphOp>(&op); + mlir::pd::GraphOp graph_op = + ::llvm::dyn_cast_or_null(&op); if (nullptr != graph_op && graph_op.getBody()->getOperations().size() <= min_subgraph_size_) { worklist.push_back(graph_op); } } while (!worklist.empty()) { - ::mlir::pd::GraphOp graph_op = worklist.back(); + mlir::pd::GraphOp graph_op = worklist.back(); worklist.pop_back(); - ::mlir::Block* body = graph_op.getBody(); - auto fetch_op = body->getTerminator(); - graph_op.replaceAllUsesWith(fetch_op->getOperands()); + mlir::Block* body = graph_op.getBody(); + auto return_op = body->getTerminator(); + graph_op.replaceAllUsesWith(return_op->getOperands()); auto copy_range = body->without_terminator(); - block.getOperations().splice(::mlir::Block::iterator(graph_op), + block.getOperations().splice(mlir::Block::iterator(graph_op), body->getOperations(), copy_range.begin(), copy_range.end()); diff --git a/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h b/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h index 092df0cf834e5..d30d186647fc3 100644 --- a/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h +++ b/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h @@ -13,7 +13,7 @@ // limitations under the License. #pragma once -#include "mlir/Pass/Pass.h" +#include namespace infrt { namespace trt { @@ -31,9 +31,9 @@ namespace trt { * %m = "pd.conv2d"(%a)... * %n = "pd.conv3d"(%m)... * %s = "pd.conv2d"(%a)... - * "pd.fetch" %n, %s + * "pd.return" (%n, %s) * } ... - * "pd.fetch" %d, %f + * "pd.fetch" (%d, %f) * } * * destination func: @@ -42,11 +42,11 @@ namespace trt { * %c = "pd.conv2d"(%a) ... * %d = "pd.conv3d"(%c) ... * %f = "pd.conv2d"(%a) ... 
- * "pd.fetch" %d, %f + * "pd.fetch" (%d, %f) * } */ class trtGraphSplitPass - : public ::mlir::PassWrapper { + : public mlir::PassWrapper { public: ::llvm::StringRef getName() const override { return "trtGraphSplitPass"; } void runOnFunction() override; diff --git a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc index 7b7fbb05c1d13..4e8d40b982b2e 100644 --- a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc +++ b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc @@ -14,49 +14,48 @@ #include "paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h" -#include "mlir/IR/Builders.h" +#include #include "paddle/infrt/dialect/pd_ops.h" -#include "paddle/infrt/dialect/tensorrt/trt_ops.h" namespace infrt { namespace trt { // Implementation of the trtOpTellerPass。 void trtOpTellerPass::runOnFunction() { - ::mlir::Block &body = getFunction().front(); - std::vector<::mlir::Operation *> worklist; + mlir::Block &body = getFunction().front(); + std::vector worklist; worklist.reserve(body.getOperations().size()); for (auto &op : body) { worklist.push_back(&op); } // Build GraphOp. - ::mlir::OpBuilder builder(&body, body.begin()); + mlir::OpBuilder builder(&body, body.begin()); while (!worklist.empty()) { auto *op = worklist.back(); worklist.pop_back(); if (op == nullptr) continue; - auto op1 = ::llvm::dyn_cast_or_null<::mlir::pd::FeedOp>(op); + auto op1 = ::llvm::dyn_cast_or_null(op); if (op1) continue; - auto op2 = ::llvm::dyn_cast_or_null<::mlir::pd::FetchOp>(op); + auto op2 = ::llvm::dyn_cast_or_null(op); if (op2) continue; - auto op3 = ::llvm::dyn_cast_or_null<::mlir::pd::GraphOp>(op); + auto op3 = ::llvm::dyn_cast_or_null(op); if (op3) continue; builder.setInsertionPoint(op); auto loc = getFunction().getLoc(); - auto graph_op = builder.create<::mlir::pd::GraphOp>( + auto graph_op = builder.create( loc, op->getResultTypes(), op->getOperands()); - ::llvm::SmallVector<::mlir::Value, 4> tblgen_repl_values; + ::llvm::SmallVector tblgen_repl_values; for (auto v : - ::llvm::SmallVector<::mlir::Value, 4>{graph_op.getODSResults(0)}) { + ::llvm::SmallVector{graph_op.getODSResults(0)}) { tblgen_repl_values.push_back(v); } op->replaceAllUsesWith(tblgen_repl_values); // Build graph op. - ::mlir::Block *block = new ::mlir::Block; + mlir::Block *block = new mlir::Block; graph_op.body().push_back(block); op->moveBefore(block, block->begin()); builder.setInsertionPointToEnd(block); - builder.create(loc, op->getResults()); + builder.create(loc, op->getResults()); } } } // namespace trt diff --git a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h index b03945b3459c0..fb16c974f7fb3 100644 --- a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h +++ b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h @@ -13,7 +13,7 @@ // limitations under the License. #pragma once -#include "mlir/Pass/Pass.h" +#include namespace infrt { namespace trt { @@ -29,7 +29,7 @@ namespace trt { * %c = "pd.conv2d"(%a) ... * %d = "pd.conv3d"(%c) ... * %f = "pd.conv2d"(%a) ... - * "pd.fetch" %d, %f + * "pd.fetch" (%d, %f) * } * * destination func: @@ -37,23 +37,23 @@ namespace trt { * %a = "pd.feed"()... * %c = "pd.graph"(%a) { * %m = "pd.conv2d"(%a)... - * "pd.fetch" %m + * "pd.return" (%m) * } ... * %d = "pd.graph"(%c) { * %m = "pd.conv3d"(%c)... - * "pd.fetch" %m + * "pd.return" (%m) * } ... * %f = "pd.graph"(%a) { * %m = "pd.conv2d"(%a)... - * "pd.fetch" %m + * "pd.return" (%m) * } ... 
- * "pd.fetch" %d, %f + * "pd.fetch" (%d, %f) * } * TODO(winter-wang): Supplementary how to judge the operators can be supported * by tensorrt. */ class trtOpTellerPass - : public ::mlir::PassWrapper { + : public mlir::PassWrapper { public: ::llvm::StringRef getName() const override { return "trtOpTellerPass"; } void runOnFunction() override; diff --git a/paddle/infrt/dialect/tensorrt/trt_ops.cc b/paddle/infrt/dialect/tensorrt/trt_ops.cc index 4c02238b10e1d..35b7967892caf 100644 --- a/paddle/infrt/dialect/tensorrt/trt_ops.cc +++ b/paddle/infrt/dialect/tensorrt/trt_ops.cc @@ -13,27 +13,25 @@ // limitations under the License. #include "paddle/infrt/dialect/tensorrt/trt_ops.h" -#include "mlir/IR/Matchers.h" -#include "mlir/IR/OpImplementation.h" -#include "mlir/IR/PatternMatch.h" -#include "mlir/Interfaces/CallInterfaces.h" -#include "mlir/Interfaces/SideEffectInterfaces.h" +#include +#include +#include +#include +#include namespace infrt { namespace trt { -TensorRTDialect::TensorRTDialect(::mlir::MLIRContext *context) - : ::mlir::Dialect("trt", context, ::mlir::TypeID::get()) { +TensorRTDialect::TensorRTDialect(mlir::MLIRContext *context) + : mlir::Dialect("trt", context, mlir::TypeID::get()) { addOperations< #define GET_OP_LIST #include "paddle/infrt/dialect/tensorrt/trt_ops.cpp.inc" // NOLINT >(); -#undef GET_OP_LIST } -#define GET_OP_CLASSES -#include "paddle/infrt/dialect/tensorrt/trt_ops.cpp.inc" // NOLINT -#undef GET_OP_CLASSES - } // namespace trt } // namespace infrt + +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/tensorrt/trt_ops.cpp.inc" // NOLINT diff --git a/paddle/infrt/dialect/tensorrt/trt_ops.h b/paddle/infrt/dialect/tensorrt/trt_ops.h index c9043c2280de0..a37491ec1abc7 100644 --- a/paddle/infrt/dialect/tensorrt/trt_ops.h +++ b/paddle/infrt/dialect/tensorrt/trt_ops.h @@ -14,37 +14,32 @@ #pragma once -#include "mlir/Dialect/Traits.h" -#include "mlir/IR/Attributes.h" -#include "mlir/IR/Builders.h" -#include "mlir/IR/Dialect.h" -#include "mlir/IR/Function.h" -#include "mlir/IR/Matchers.h" -#include "mlir/IR/Module.h" -#include "mlir/IR/OpImplementation.h" -#include "mlir/IR/StandardTypes.h" -#include "mlir/IR/TypeUtilities.h" -#include "mlir/Interfaces/CallInterfaces.h" -#include "mlir/Interfaces/DerivedAttributeOpInterface.h" -#include "mlir/Interfaces/InferTypeOpInterface.h" -#include "mlir/Interfaces/LoopLikeInterface.h" -#include "mlir/Interfaces/SideEffectInterfaces.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include namespace infrt { namespace trt { -class TensorRTDialect : public ::mlir::Dialect { +class TensorRTDialect : public mlir::Dialect { public: - explicit TensorRTDialect(::mlir::MLIRContext* context); + explicit TensorRTDialect(mlir::MLIRContext* context); static llvm::StringRef getDialectNamespace() { return "trt"; } }; -// mlir bug。 can be removed safety when update mlir to llvm11. 
-using namespace mlir; // NOLINT +} // namespace trt +} // namespace infrt #define GET_OP_CLASSES #include "paddle/infrt/dialect/tensorrt/trt_ops.hpp.inc" -#undef GET_OP_CLASSES - -} // namespace trt -} // namespace infrt diff --git a/paddle/infrt/dialect/test_kernels.cc b/paddle/infrt/dialect/test_kernels.cc index 894d96f95ad5c..c4588d7cf8bab 100644 --- a/paddle/infrt/dialect/test_kernels.cc +++ b/paddle/infrt/dialect/test_kernels.cc @@ -14,14 +14,13 @@ #include "paddle/infrt/dialect/test_kernels.h" -#include "mlir/IR/Builders.h" -#include "mlir/IR/OpDefinition.h" -#include "mlir/IR/OpImplementation.h" -#include "mlir/IR/StandardTypes.h" -#include "mlir/IR/TypeUtilities.h" - -namespace infrt::dialect { +#include +#include +#include +#include +namespace infrt { +namespace dialect { //===----------------------------------------------------------------------===// // BenchmarkOp //===----------------------------------------------------------------------===// @@ -32,65 +31,67 @@ namespace infrt::dialect { // ... // } -static ParseResult parseBenchmarkOp(OpAsmParser &parser, // NOLINT - OperationState &result) { // NOLINT - StringAttr nameAttr; +static mlir::ParseResult parseBenchmarkOp( + mlir::OpAsmParser &parser, // NOLINT + mlir::OperationState &result) { // NOLINT + mlir::StringAttr nameAttr; if (parser.parseAttribute(nameAttr, "name", result.attributes)) - return failure(); + return mlir::failure(); // Parse the operands, e.g. (%c : i32, %d : f32) - if (parser.parseLParen()) return failure(); + if (parser.parseLParen()) return mlir::failure(); - SmallVector operands; - SmallVector types; + llvm::SmallVector operands; + llvm::SmallVector types; llvm::SMLoc type_loc = parser.getCurrentLocation(); if (parser.parseOptionalRParen()) { // Parse non-empty operands do { // Parse %c : i32, - OpAsmParser::OperandType operand; - Type type; + mlir::OpAsmParser::OperandType operand; + mlir::Type type; if (parser.parseOperand(operand) || parser.parseColonType(type)) - return failure(); + return mlir::failure(); operands.push_back(operand); types.push_back(type); } while (succeeded(parser.parseOptionalComma())); - if (parser.parseRParen()) return failure(); + if (parser.parseRParen()) return mlir::failure(); } if (parser.resolveOperands(operands, types, type_loc, result.operands)) - return failure(); + return mlir::failure(); // Parse the keyword attribute, e.g. 
max_count = 100, duration_secs = 1 do { - StringRef attr; - Attribute resultAttr; + mlir::StringRef attr; + mlir::Attribute resultAttr; if (parser.parseKeyword(&attr) || parser.parseEqual() || parser.parseAttribute(resultAttr, parser.getBuilder().getIntegerType(32), attr, result.attributes)) - return failure(); - } while (succeeded(parser.parseOptionalComma())); + return mlir::failure(); + } while (mlir::succeeded(parser.parseOptionalComma())); // Set the default attribute num_warmup_runs to 1 if unset auto setDefaultAttrIfUnset = [&](const char *attr_name, int value) { bool found = llvm::any_of(result.attributes, - [attr_name](const NamedAttribute &attr) { - return attr.first == attr_name; + [attr_name](const mlir::NamedAttribute &attr) { + return attr.getName() == attr_name; }); if (!found) { - IntegerAttr default_val = parser.getBuilder().getI32IntegerAttr(value); + mlir::IntegerAttr default_val = + parser.getBuilder().getI32IntegerAttr(value); result.addAttribute(attr_name, default_val); } }; setDefaultAttrIfUnset("num_warmup_runs", 1); - Region *target = result.addRegion(); + mlir::Region *target = result.addRegion(); return parser.parseRegion(*target, operands, types, @@ -102,11 +103,11 @@ static ParseResult parseBenchmarkOp(OpAsmParser &parser, // NOLINT // max_count = 100, duration_secs = 1 { // ... // } -static void print(OpAsmPrinter &p, BenchmarkOp op) { // NOLINT +static void print(mlir::OpAsmPrinter &p, BenchmarkOp op) { // NOLINT p << "infrt.benchmark "; // Print the name attribute, e.g "add.i32" - auto name_attr = op.getAttr("name"); + auto name_attr = op->getAttr("name"); p << name_attr; // Print the operands and types, e.g. (%c : i32, %d : f32) @@ -120,13 +121,13 @@ static void print(OpAsmPrinter &p, BenchmarkOp op) { // NOLINT bool need_comma = false; // Print the attributes, e.g. max_count = 100, duration_secs = 1 - for (auto &name_attr : op.getAttrs()) { - auto id = name_attr.first; + for (auto &name_attr : op->getAttrs()) { + auto id = name_attr.getName(); if (id == "name") continue; if (need_comma) p << ", "; - auto attr = name_attr.second; + auto attr = name_attr.getValue(); p << id << " = "; - if (auto int_attr = attr.dyn_cast()) { + if (auto int_attr = attr.dyn_cast()) { int_attr.getValue().print(p.getStream(), /*isSigned=*/false); } else { op.emitOpError("Unexpected attribute"); @@ -142,7 +143,7 @@ static void print(OpAsmPrinter &p, BenchmarkOp op) { // NOLINT p.printRegion(op.region(), /*printEntryBlockArgs=*/false); } -static LogicalResult verify(BenchmarkOp op) { +static mlir::LogicalResult verify(BenchmarkOp op) { // Verify that the target benchmark region has exactly one return value. auto ®ion = op.region(); auto &last_op = region.front().back(); @@ -154,10 +155,10 @@ static LogicalResult verify(BenchmarkOp op) { "incorrect number of return values. One return value is expected"); } - return success(); + return mlir::success(); } +} // namespace dialect +} // namespace infrt #define GET_OP_CLASSES #include "paddle/infrt/dialect/test_kernels.cpp.inc" - -} // namespace infrt::dialect diff --git a/paddle/infrt/dialect/test_kernels.h b/paddle/infrt/dialect/test_kernels.h index 29d4209cb7280..73c8a6fb387bc 100644 --- a/paddle/infrt/dialect/test_kernels.h +++ b/paddle/infrt/dialect/test_kernels.h @@ -13,11 +13,8 @@ // limitations under the License. 
#pragma once -#include "mlir/IR/OpDefinition.h" -#include "mlir/Interfaces/SideEffectInterfaces.h" +#include +#include -namespace infrt::dialect { -using namespace mlir; // NOLINT #define GET_OP_CLASSES #include "paddle/infrt/dialect/test_kernels.hpp.inc" -} // namespace infrt::dialect diff --git a/paddle/infrt/dialect/types.cc b/paddle/infrt/dialect/types.cc deleted file mode 100644 index 6d6f6a20b46c9..0000000000000 --- a/paddle/infrt/dialect/types.cc +++ /dev/null @@ -1,17 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/infrt/dialect/types.h" - -namespace infrt::hlir::mlir {} // namespace infrt::hlir::mlir diff --git a/paddle/infrt/dialect/types.h b/paddle/infrt/dialect/types.h deleted file mode 100644 index a9a2b61871cc0..0000000000000 --- a/paddle/infrt/dialect/types.h +++ /dev/null @@ -1,16 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once -#include diff --git a/paddle/infrt/host_context/core_runtime.cc b/paddle/infrt/host_context/core_runtime.cc index cdb8cc99ecb26..e3917bd07d242 100644 --- a/paddle/infrt/host_context/core_runtime.cc +++ b/paddle/infrt/host_context/core_runtime.cc @@ -23,7 +23,8 @@ #include "paddle/infrt/host_context/op_executable.h" #include "paddle/infrt/host_context/symbol_table.h" -namespace infrt::host_context { +namespace infrt { +namespace host_context { struct CoreRuntime::Impl { KernelRegistry* kernel_registry{}; @@ -90,4 +91,5 @@ llvm::SmallVector CoreRuntime::GetResults( CoreRuntime::~CoreRuntime() {} -} // namespace infrt::host_context +} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/host_context/core_runtime.h b/paddle/infrt/host_context/core_runtime.h index 802f8b17bb010..acb6a66cac630 100644 --- a/paddle/infrt/host_context/core_runtime.h +++ b/paddle/infrt/host_context/core_runtime.h @@ -22,7 +22,8 @@ #include "paddle/infrt/host_context/value.h" -namespace infrt::host_context { +namespace infrt { +namespace host_context { class KernelRegistry; class OpExecutable; @@ -83,4 +84,5 @@ class CoreRuntimeBuilder : public CoreRuntime { OpExecutableBuilder* NewOpExecutable(const std::string& op_name); }; -} // namespace infrt::host_context +} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/host_context/kernel_frame.h b/paddle/infrt/host_context/kernel_frame.h index 20cb17dc7fbe2..5186b88fe2c41 100644 --- a/paddle/infrt/host_context/kernel_frame.h +++ b/paddle/infrt/host_context/kernel_frame.h @@ -21,7 +21,8 @@ #include "llvm/ADT/SmallVector.h" #include "paddle/infrt/host_context/value.h" -namespace infrt::host_context { +namespace infrt { +namespace host_context { /** * KernelFrame captures the states(input arguments, attributes, results) @@ -163,4 +164,5 @@ class KernelFrameBuilder : public KernelFrame { } }; -} // namespace infrt::host_context +} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/host_context/kernel_registry_test.cc b/paddle/infrt/host_context/kernel_registry_test.cc index f36ec2a1cac7d..7fca56343041c 100644 --- a/paddle/infrt/host_context/kernel_registry_test.cc +++ b/paddle/infrt/host_context/kernel_registry_test.cc @@ -18,7 +18,8 @@ #include "paddle/infrt/host_context/kernel_utils.h" -namespace infrt::host_context { +namespace infrt { +namespace host_context { int add_i32(int a, int b) { return a + b; } @@ -44,4 +45,5 @@ TEST(KernelRegistry, basic) { ASSERT_EQ(results[0]->get(), 3); } -} // namespace infrt::host_context +} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/host_context/kernel_utils_test.cc b/paddle/infrt/host_context/kernel_utils_test.cc index 1904eb106a293..bebd8d86e50bb 100644 --- a/paddle/infrt/host_context/kernel_utils_test.cc +++ b/paddle/infrt/host_context/kernel_utils_test.cc @@ -16,7 +16,8 @@ #include -namespace infrt::host_context { +namespace infrt { +namespace host_context { int add_i32(int a, int b) { return a + b; } float add_f32(float a, float b) { return a + b; } @@ -66,4 +67,5 @@ TEST(KernelImpl, pair) { ASSERT_EQ(results[1]->get(), 3.f); } -} // namespace infrt::host_context +} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/host_context/mlir_function_executable.cc b/paddle/infrt/host_context/mlir_function_executable.cc index 5f8dacf8e448a..47ec27ebec300 100644 --- a/paddle/infrt/host_context/mlir_function_executable.cc +++ b/paddle/infrt/host_context/mlir_function_executable.cc @@ -15,6 +15,7 @@ #include 
"paddle/infrt/host_context/mlir_function_executable.h" #include +#include #include // NOLINT diff --git a/paddle/infrt/host_context/mlir_function_executable.h b/paddle/infrt/host_context/mlir_function_executable.h index ba5fa154d6fcc..a6428df86e6b2 100644 --- a/paddle/infrt/host_context/mlir_function_executable.h +++ b/paddle/infrt/host_context/mlir_function_executable.h @@ -13,7 +13,8 @@ // limitations under the License. #pragma once -#include +#include +#include #include #include diff --git a/paddle/infrt/host_context/mlir_program_executor.h b/paddle/infrt/host_context/mlir_program_executor.h index b2af4d2d79db5..c2ccb90640b21 100644 --- a/paddle/infrt/host_context/mlir_program_executor.h +++ b/paddle/infrt/host_context/mlir_program_executor.h @@ -15,9 +15,9 @@ #pragma once #include +#include +#include #include -#include -#include #include #include diff --git a/paddle/infrt/host_context/mlir_to_runtime_translate.cc b/paddle/infrt/host_context/mlir_to_runtime_translate.cc index 25324b1291582..3dbc7a702be38 100644 --- a/paddle/infrt/host_context/mlir_to_runtime_translate.cc +++ b/paddle/infrt/host_context/mlir_to_runtime_translate.cc @@ -16,8 +16,9 @@ #include #include +#include +#include #include -#include #include #include @@ -40,7 +41,8 @@ #include "paddle/infrt/host_context/value.h" #include "paddle/infrt/tensor/tensor_shape.h" -namespace infrt::host_context { +namespace infrt { +namespace host_context { template std::string DumpToString(T& op) { // NOLINT @@ -113,10 +115,10 @@ bool MlirToRuntimeTranslator::EmitConstantOp(mlir::Operation* op) { template <> boost::optional MlirToRuntimeTranslator::EmitAttribute( - const mlir::Attribute* attr) { - if (!attr->isa()) return boost::none; - if (attr->isa()) { - auto val = attr->cast(); + const mlir::Attribute& attr) { + if (!attr.isa()) return boost::none; + if (attr.isa()) { + auto val = attr.cast(); if (val.getType().isInteger(32)) { return val.getInt(); } @@ -125,10 +127,10 @@ boost::optional MlirToRuntimeTranslator::EmitAttribute( } template <> boost::optional MlirToRuntimeTranslator::EmitAttribute( - const mlir::Attribute* attr) { - if (!attr->isa()) return boost::none; - if (attr->isa()) { - auto val = attr->cast(); + const mlir::Attribute& attr) { + if (!attr.isa()) return boost::none; + if (attr.isa()) { + auto val = attr.cast(); if (val.getType().isInteger(64)) { return val.getInt(); } @@ -139,10 +141,10 @@ boost::optional MlirToRuntimeTranslator::EmitAttribute( // TODO(Superjomn) Make double and float parsing share some thing. 
template <> boost::optional MlirToRuntimeTranslator::EmitAttribute( - const mlir::Attribute* attr) { - if (!attr->isa()) return boost::none; - if (attr->isa()) { - auto val = attr->cast(); + const mlir::Attribute& attr) { + if (!attr.isa()) return boost::none; + if (attr.isa()) { + auto val = attr.cast(); if (val.getType().isF32()) return val.getValueAsDouble(); } return boost::none; @@ -150,10 +152,10 @@ boost::optional MlirToRuntimeTranslator::EmitAttribute( template <> boost::optional MlirToRuntimeTranslator::EmitAttribute( - const mlir::Attribute* attr) { - if (!attr->isa()) return boost::none; - if (attr->isa()) { - auto val = attr->cast(); + const mlir::Attribute& attr) { + if (!attr.isa()) return boost::none; + if (attr.isa()) { + auto val = attr.cast(); if (val.getType().isF64()) return val.getValueAsDouble(); } return boost::none; @@ -161,17 +163,17 @@ boost::optional MlirToRuntimeTranslator::EmitAttribute( template <> boost::optional MlirToRuntimeTranslator::EmitAttribute( - const mlir::Attribute* attr) { - if (!attr->isa()) return boost::none; - return attr->cast().getValue().str(); + const mlir::Attribute& attr) { + if (!attr.isa()) return boost::none; + return attr.cast().getValue().str(); } #define PROCESS_ARRAY_INT(type__, bits__) \ template <> \ boost::optional> MlirToRuntimeTranslator::EmitAttribute( \ - const mlir::Attribute* attr) { \ - if (!attr->isa()) return boost::none; \ - auto array = attr->cast(); \ + const mlir::Attribute& attr) { \ + if (!attr.isa()) return boost::none; \ + auto array = attr.cast(); \ CHECK(!array.empty()); \ \ if (!array[0].getType().isInteger(bits__)) { \ @@ -191,9 +193,9 @@ PROCESS_ARRAY_INT(int64_t, 64); template <> boost::optional> MlirToRuntimeTranslator::EmitAttribute( - const mlir::Attribute* attr) { - if (!attr->isa()) return boost::none; - auto array = attr->cast(); + const mlir::Attribute& attr) { + if (!attr.isa()) return boost::none; + auto array = attr.cast(); CHECK(!array.empty()); if (!array[0].getType().isF32()) return boost::none; @@ -207,9 +209,9 @@ boost::optional> MlirToRuntimeTranslator::EmitAttribute( template <> boost::optional> MlirToRuntimeTranslator::EmitAttribute( - const mlir::Attribute* attr) { - if (!attr->isa()) return boost::none; - auto array = attr->cast(); + const mlir::Attribute& attr) { + if (!attr.isa()) return boost::none; + auto array = attr.cast(); CHECK(!array.empty()); if (!array[0].getType().isF64()) return boost::none; @@ -236,7 +238,8 @@ bool MlirToRuntimeTranslator::EmitGeneralOp(mlir::Operation* op) { for (int i = 0, e = op->getNumOperands(); i < e; i++) { // function argument as value auto operand = op->getOperand(i); - if (operand.getKind() == mlir::Value::Kind::BlockArgument) { + /// if (operand.getKind() == mlir::Value::Kind::BlockArgument) { + if (operand.isa()) { mlir::BlockArgument arg = operand.dyn_cast(); Value* arg_value = GetValue(arg); impl_->cur_op->AppendArgument(arg_value); @@ -283,25 +286,25 @@ bool MlirToRuntimeTranslator::EmitGeneralOp(mlir::Operation* op) { for (size_t i = 0; i < attrs.size(); i++) { auto& attr = attrs[i]; - if (auto v = EmitAttribute(&attr.second)) { + if (auto v = EmitAttribute(attr.getValue())) { impl_->cur_op->AppendAttribute(new Value(*v)); - } else if (auto v = EmitAttribute(&attr.second)) { + } else if (auto v = EmitAttribute(attr.getValue())) { impl_->cur_op->AppendAttribute(new Value(*v)); - } else if (auto v = EmitAttribute(&attr.second)) { + } else if (auto v = EmitAttribute(attr.getValue())) { impl_->cur_op->AppendAttribute(new Value(*v)); - } else 
if (auto v = EmitAttribute(&attr.second)) { + } else if (auto v = EmitAttribute(attr.getValue())) { impl_->cur_op->AppendAttribute(new Value(*v)); - } else if (auto v = EmitAttribute(&attr.second)) { + } else if (auto v = EmitAttribute(attr.getValue())) { impl_->cur_op->AppendAttribute(new Value(std::move(*v))); - } else if (auto v = EmitAttribute>(&attr.second)) { + } else if (auto v = EmitAttribute>(attr.getValue())) { impl_->cur_op->AppendAttribute(new Value(std::move(*v))); - } else if (auto v = EmitAttribute>(&attr.second)) { + } else if (auto v = EmitAttribute>(attr.getValue())) { impl_->cur_op->AppendAttribute(new Value(std::move(*v))); - } else if (auto v = EmitAttribute>(&attr.second)) { + } else if (auto v = EmitAttribute>(attr.getValue())) { impl_->cur_op->AppendAttribute(new Value(std::move(*v))); - } else if (auto v = EmitAttribute>(&attr.second)) { + } else if (auto v = EmitAttribute>(attr.getValue())) { impl_->cur_op->AppendAttribute(new Value(std::move(*v))); - } else if (auto v = EmitAttribute>(&attr.second)) { + } else if (auto v = EmitAttribute>(attr.getValue())) { impl_->cur_op->AppendAttribute(new Value(std::move(*v))); } else { LOG(FATAL) << "Not supported attribute type"; @@ -330,7 +333,7 @@ bool MlirToRuntimeTranslator::EmitGeneralOp(mlir::Operation* op) { llvm::SmallVector results; auto func_type = - mlir::FunctionType::get(inputs, results, region.getContext()); + mlir::FunctionType::get(region.getContext(), inputs, results); auto* function = impl_->cur_op->CreateFunctionExecutable( ®ion, func_type, &impl_->func_defs); impl_->cur_op->AppendAttribute(new Value(function)); @@ -555,4 +558,5 @@ void TestMlir(mlir::ModuleOp module, KernelRegistry* registry) { execute.Run(); } -} // namespace infrt::host_context +} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/host_context/mlir_to_runtime_translate.h b/paddle/infrt/host_context/mlir_to_runtime_translate.h index 598e81bfd96d8..fcd79eaf386ee 100644 --- a/paddle/infrt/host_context/mlir_to_runtime_translate.h +++ b/paddle/infrt/host_context/mlir_to_runtime_translate.h @@ -29,7 +29,8 @@ class Attribute; class Value; } // namespace mlir -namespace infrt::host_context { +namespace infrt { +namespace host_context { class CoreRuntimeBuilder; class Value; @@ -73,7 +74,7 @@ class MlirToRuntimeTranslator { bool EmitCallOp(mlir::Operation* op, function_defs_t* function_table); template - boost::optional EmitAttribute(const mlir::Attribute* attr); + boost::optional EmitAttribute(const mlir::Attribute& attr); Value* GetOpResult(mlir::Operation* op); @@ -104,4 +105,5 @@ void MlirToRuntimeTranslate(mlir::ModuleOp module, CoreRuntimeBuilder* runtime); */ void TestMlir(mlir::ModuleOp module, KernelRegistry* registry); -} // namespace infrt::host_context +} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/host_context/mlir_to_runtime_translate_test.cc b/paddle/infrt/host_context/mlir_to_runtime_translate_test.cc index 9b85be977ab6c..375daa4515e17 100644 --- a/paddle/infrt/host_context/mlir_to_runtime_translate_test.cc +++ b/paddle/infrt/host_context/mlir_to_runtime_translate_test.cc @@ -29,7 +29,8 @@ #include "paddle/infrt/kernel/tensor_shape_kernels.h" #include "paddle/infrt/kernel/test_kernels.h" -namespace infrt::host_context { +namespace infrt { +namespace host_context { TEST(MlirToRuntimeTranslate, basic) { mlir::MLIRContext context; @@ -48,7 +49,7 @@ func @main() -> () { )ROC"; auto module = dialect::LoadMlirSource(&context, source); - module->verify(); + 
EXPECT_TRUE(mlir::succeeded(module->verify())); KernelRegistry registry; kernel::RegisterFloatBasicKernels(®istry); @@ -74,7 +75,7 @@ func @main() -> () { )ROC"; auto module = dialect::LoadMlirSource(&context, source); - module->verify(); + EXPECT_TRUE(mlir::succeeded(module->verify())); KernelRegistry registry; kernel::RegisterFloatBasicKernels(®istry); @@ -115,7 +116,7 @@ infrt.return %a0, %b0: !infrt.tensor, !infrt.tensorverify(); + EXPECT_TRUE(mlir::succeeded(module->verify())); host_context::KernelRegistry registry; @@ -157,4 +158,5 @@ infrt.return %a0, %b0: !infrt.tensor, !infrt.tensor #include #include "paddle/infrt/host_context/kernel_frame.h" @@ -21,7 +22,8 @@ #include "paddle/infrt/host_context/mlir_function_executable.h" #include "paddle/infrt/host_context/symbol_table.h" -namespace infrt::host_context { +namespace infrt { +namespace host_context { struct OpExecutable::Impl { Impl(const std::string& op_name, @@ -148,4 +150,5 @@ void OpExecutable::Execute() { OpExecutable::~OpExecutable() {} -} // namespace infrt::host_context +} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/host_context/op_executable.h b/paddle/infrt/host_context/op_executable.h index e2248225a5caf..550f6ab6349ed 100644 --- a/paddle/infrt/host_context/op_executable.h +++ b/paddle/infrt/host_context/op_executable.h @@ -14,19 +14,18 @@ #pragma once #include - +#include +#include #include #include #include -#include "mlir/IR/Function.h" -#include "mlir/IR/Region.h" - namespace mlir { class FuncOp; } // namespace mlir -namespace infrt::host_context { +namespace infrt { +namespace host_context { class SymbolTable; class KernelRegistry; @@ -89,4 +88,5 @@ class OpExecutableBuilder : public OpExecutable { function_defs_t* function_defs); }; -} // namespace infrt::host_context +} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/kernel/basic_kernels.cc b/paddle/infrt/kernel/basic_kernels.cc index d7f2c3865157d..b186cfcfd2b35 100644 --- a/paddle/infrt/kernel/basic_kernels.cc +++ b/paddle/infrt/kernel/basic_kernels.cc @@ -23,7 +23,8 @@ using infrt::host_context::Attribute; -namespace infrt::kernel { +namespace infrt { +namespace kernel { template T add(T a, T b) { @@ -82,4 +83,5 @@ void RegisterFloatBasicKernels(host_context::KernelRegistry *registry) { registry->AddKernel("infrt.print.f32", INFRT_KERNEL(print)); } -} // namespace infrt::kernel +} // namespace kernel +} // namespace infrt diff --git a/paddle/infrt/kernel/basic_kernels.h b/paddle/infrt/kernel/basic_kernels.h index 9e98885cf6ebf..feb66be61f530 100644 --- a/paddle/infrt/kernel/basic_kernels.h +++ b/paddle/infrt/kernel/basic_kernels.h @@ -15,13 +15,16 @@ #pragma once #include -namespace infrt::host_context { +namespace infrt { +namespace host_context { struct KernelRegistry; -} // namespace infrt::host_context +} // namespace host_context +} // namespace infrt -namespace infrt::kernel { +namespace infrt { +namespace kernel { /** * Register all the basic kernels to \p registry. 
@@ -31,4 +34,5 @@ void RegisterBasicKernels(host_context::KernelRegistry* registry); void RegisterIntBasicKernels(host_context::KernelRegistry* registry); void RegisterFloatBasicKernels(host_context::KernelRegistry* registry); -} // namespace infrt::kernel +} // namespace kernel +} // namespace infrt diff --git a/paddle/infrt/kernel/tensor_kernels.cc b/paddle/infrt/kernel/tensor_kernels.cc index 2fa477aa4dbda..51e0004922374 100644 --- a/paddle/infrt/kernel/tensor_kernels.cc +++ b/paddle/infrt/kernel/tensor_kernels.cc @@ -25,7 +25,8 @@ #include "paddle/infrt/tensor/tensor_map.h" #include "paddle/infrt/tensor/tensor_shape.h" -namespace infrt::kernel { +namespace infrt { +namespace kernel { using namespace host_context; // NOLINT using namespace tensor; // NOLINT @@ -76,4 +77,5 @@ void RegisterTensorKernels(host_context::KernelRegistry *registry) { INFRT_KERNEL(ShallowCopyTensor)); } -} // namespace infrt::kernel +} // namespace kernel +} // namespace infrt diff --git a/paddle/infrt/kernel/tensor_kernels.h b/paddle/infrt/kernel/tensor_kernels.h index 8f2180ba80a4f..df8e25c32393c 100644 --- a/paddle/infrt/kernel/tensor_kernels.h +++ b/paddle/infrt/kernel/tensor_kernels.h @@ -14,12 +14,16 @@ #pragma once -namespace infrt::host_context { +namespace infrt { +namespace host_context { struct KernelRegistry; -} // namespace infrt::host_context +} // namespace host_context +} // namespace infrt -namespace infrt::kernel { +namespace infrt { +namespace kernel { void RegisterTensorKernels(host_context::KernelRegistry* registry); -} // namespace infrt::kernel +} // namespace kernel +} // namespace infrt diff --git a/paddle/infrt/kernel/tensor_shape_kernels.cc b/paddle/infrt/kernel/tensor_shape_kernels.cc index a04b492819298..4edbecfa10886 100644 --- a/paddle/infrt/kernel/tensor_shape_kernels.cc +++ b/paddle/infrt/kernel/tensor_shape_kernels.cc @@ -24,7 +24,8 @@ #include "paddle/infrt/host_context/kernel_utils.h" #include "paddle/infrt/tensor/tensor_shape.h" -namespace infrt::kernel { +namespace infrt { +namespace kernel { void PrintShape(const tensor::TensorShape& shape) { llvm::raw_os_ostream oos(std::cout); @@ -35,4 +36,5 @@ void RegisterTensorShapeKernels(host_context::KernelRegistry* registry) { registry->AddKernel("ts.print_shape", INFRT_KERNEL(PrintShape)); } -} // namespace infrt::kernel +} // namespace kernel +} // namespace infrt diff --git a/paddle/infrt/kernel/tensor_shape_kernels.h b/paddle/infrt/kernel/tensor_shape_kernels.h index e87c6c37e88a0..e31a37463be43 100644 --- a/paddle/infrt/kernel/tensor_shape_kernels.h +++ b/paddle/infrt/kernel/tensor_shape_kernels.h @@ -14,14 +14,18 @@ #pragma once -namespace infrt::host_context { +namespace infrt { +namespace host_context { class KernelRegistry; -} // namespace infrt::host_context +} // namespace host_context +} // namespace infrt -namespace infrt::kernel { +namespace infrt { +namespace kernel { void RegisterTensorShapeKernels(host_context::KernelRegistry* registry); -} // namespace infrt::kernel +} // namespace kernel +} // namespace infrt diff --git a/paddle/infrt/kernel/test_kernels.cc b/paddle/infrt/kernel/test_kernels.cc index d5f64d09b602f..ccfb3356a855f 100644 --- a/paddle/infrt/kernel/test_kernels.cc +++ b/paddle/infrt/kernel/test_kernels.cc @@ -33,7 +33,8 @@ using infrt::host_context::Attribute; using infrt::host_context::MlirFunctionExecutable; using infrt::host_context::RemainingArguments; -namespace infrt::kernel { +namespace infrt { +namespace kernel { namespace { class BenchmarkStats { public: @@ -197,4 +198,5 @@ void 
RegisterTestKernels(host_context::KernelRegistry *registry) { INFRT_KERNEL(ShadowCopyTensor)); } -} // namespace infrt::kernel +} // namespace kernel +} // namespace infrt diff --git a/paddle/infrt/kernel/test_kernels.h b/paddle/infrt/kernel/test_kernels.h index f42884dfaf2c9..f5639ec1afaad 100644 --- a/paddle/infrt/kernel/test_kernels.h +++ b/paddle/infrt/kernel/test_kernels.h @@ -15,17 +15,21 @@ #pragma once #include -namespace infrt::host_context { +namespace infrt { +namespace host_context { struct KernelRegistry; -} // namespace infrt::host_context +} // namespace host_context +} // namespace infrt -namespace infrt::kernel { +namespace infrt { +namespace kernel { /** * Register all the test kernels to registry. */ void RegisterTestKernels(host_context::KernelRegistry* registry); -} // namespace infrt::kernel +} // namespace kernel +} // namespace infrt diff --git a/paddle/infrt/paddle/cpp/desc_api.h b/paddle/infrt/paddle/cpp/desc_api.h index ccd79c048ab14..3b2dcb0018b2f 100644 --- a/paddle/infrt/paddle/cpp/desc_api.h +++ b/paddle/infrt/paddle/cpp/desc_api.h @@ -18,7 +18,9 @@ #include #include -namespace infrt::paddle::cpp { +namespace infrt { +namespace paddle { +namespace cpp { /* * Compatible interfaces for all the different kinds of XXXDesc. All the XXXDesc @@ -226,4 +228,6 @@ class ProgramDescAPI { virtual void SetVersion(int64_t version) = 0; }; -} // namespace infrt::paddle::cpp +} // namespace cpp +} // namespace paddle +} // namespace infrt diff --git a/paddle/infrt/paddle/model_parser.cc b/paddle/infrt/paddle/model_parser.cc index 285280e69435b..f3de1a630451c 100644 --- a/paddle/infrt/paddle/model_parser.cc +++ b/paddle/infrt/paddle/model_parser.cc @@ -22,7 +22,8 @@ #include "paddle/infrt/common/target.h" #include "paddle/infrt/common/type.h" -namespace infrt::paddle { +namespace infrt { +namespace paddle { int SizeOfType(framework_proto::VarType::Type type) { using Type = framework_proto::VarType::Type; @@ -169,4 +170,5 @@ void LoadParam(const std::string &path, _Variable *out, const Target &target) { LoadLoDTensor(fin, out, target); } -} // namespace infrt::paddle +} // namespace paddle +} // namespace infrt diff --git a/paddle/infrt/paddle/model_parser.h b/paddle/infrt/paddle/model_parser.h index 73125fadedb82..373f77033dcef 100644 --- a/paddle/infrt/paddle/model_parser.h +++ b/paddle/infrt/paddle/model_parser.h @@ -25,7 +25,8 @@ #include "paddle/infrt/paddle/scope.h" #include "paddle/infrt/paddle/tensor.h" -namespace infrt::paddle { +namespace infrt { +namespace paddle { namespace framework_proto = ::paddle::framework::proto; // Read a __model__ file. 
@@ -52,4 +53,5 @@ void TensorFromStream( const common::Target& target = common::DefaultHostTarget()); void ReadBinaryFile(const std::string& filename, std::string* contents); -} // namespace infrt::paddle +} // namespace paddle +} // namespace infrt diff --git a/paddle/infrt/paddle/pb/block_desc.cc b/paddle/infrt/paddle/pb/block_desc.cc index 11186bc68af16..5b28fa5464c54 100644 --- a/paddle/infrt/paddle/pb/block_desc.cc +++ b/paddle/infrt/paddle/pb/block_desc.cc @@ -14,7 +14,9 @@ #include "paddle/infrt/paddle/pb/block_desc.h" -namespace infrt::paddle::pb { +namespace infrt { +namespace paddle { +namespace pb { template <> framework_proto::VarDesc* BlockDesc::GetVar( @@ -40,4 +42,6 @@ framework_proto::OpDesc* BlockDesc::AddOp() { return desc_->add_ops(); } -} // namespace infrt::paddle::pb +} // namespace pb +} // namespace paddle +} // namespace infrt diff --git a/paddle/infrt/paddle/pb/block_desc.h b/paddle/infrt/paddle/pb/block_desc.h index 9c1b7f9adf172..c9e325699a4bc 100644 --- a/paddle/infrt/paddle/pb/block_desc.h +++ b/paddle/infrt/paddle/pb/block_desc.h @@ -18,7 +18,9 @@ #include "paddle/infrt/paddle/cpp/desc_api.h" #include "paddle/infrt/paddle/framework.pb.h" -namespace infrt::paddle::pb { +namespace infrt { +namespace paddle { +namespace pb { namespace framework_proto = ::paddle::framework::proto; @@ -74,4 +76,6 @@ class BlockDesc : public cpp::BlockDescAPI { framework_proto::BlockDesc* desc_; // not_own }; -} // namespace infrt::paddle::pb +} // namespace pb +} // namespace paddle +} // namespace infrt diff --git a/paddle/infrt/paddle/pb/op_desc.cc b/paddle/infrt/paddle/pb/op_desc.cc index c7b1e66f50642..32dcefb1ac684 100644 --- a/paddle/infrt/paddle/pb/op_desc.cc +++ b/paddle/infrt/paddle/pb/op_desc.cc @@ -14,7 +14,9 @@ #include "paddle/infrt/paddle/pb/op_desc.h" -namespace infrt::paddle::pb { +namespace infrt { +namespace paddle { +namespace pb { google::protobuf::internal::RepeatedPtrIterator FindAttr(framework_proto::OpDesc *desc, const std::string &name) { @@ -136,4 +138,6 @@ GET_ATTRS_IMPL(std::vector, strings); GET_ATTR_IMPL(std::string, s); GET_ATTRS_IMPL(std::vector, longs); -} // namespace infrt::paddle::pb +} // namespace pb +} // namespace paddle +} // namespace infrt diff --git a/paddle/infrt/paddle/pb/op_desc.h b/paddle/infrt/paddle/pb/op_desc.h index 81d57d9f32252..2829f2aca2e08 100644 --- a/paddle/infrt/paddle/pb/op_desc.h +++ b/paddle/infrt/paddle/pb/op_desc.h @@ -19,7 +19,9 @@ #include "paddle/infrt/paddle/framework.pb.h" #include "paddle/infrt/support/variant.h" -namespace infrt::paddle::pb { +namespace infrt { +namespace paddle { +namespace pb { namespace framework_proto = ::paddle::framework::proto; @@ -195,4 +197,6 @@ template <> void OpDesc::SetAttr>(const std::string &name, const std::vector &v); -} // namespace infrt::paddle::pb +} // namespace pb +} // namespace paddle +} // namespace infrt diff --git a/paddle/infrt/paddle/pb/program_desc.cc b/paddle/infrt/paddle/pb/program_desc.cc index ed8a7e36e0129..9d725485a974d 100644 --- a/paddle/infrt/paddle/pb/program_desc.cc +++ b/paddle/infrt/paddle/pb/program_desc.cc @@ -17,7 +17,9 @@ #include #include -namespace infrt::paddle::pb { +namespace infrt { +namespace paddle { +namespace pb { template <> framework_proto::BlockDesc* ProgramDesc::GetBlock( @@ -32,4 +34,6 @@ ProgramDesc::AddBlock() { return desc_->add_blocks(); } -} // namespace infrt::paddle::pb +} // namespace pb +} // namespace paddle +} // namespace infrt diff --git a/paddle/infrt/paddle/pb/program_desc.h b/paddle/infrt/paddle/pb/program_desc.h 
index 4adad650c974d..b1e64f8e86611 100644 --- a/paddle/infrt/paddle/pb/program_desc.h +++ b/paddle/infrt/paddle/pb/program_desc.h @@ -21,7 +21,9 @@ #include "paddle/infrt/paddle/cpp/desc_api.h" #include "paddle/infrt/paddle/framework.pb.h" -namespace infrt::paddle::pb { +namespace infrt { +namespace paddle { +namespace pb { namespace framework_proto = ::paddle::framework::proto; class ProgramDesc : public cpp::ProgramDescAPI { @@ -58,4 +60,6 @@ class ProgramDesc : public cpp::ProgramDescAPI { framework_proto::ProgramDesc *desc_; // not_own }; -} // namespace infrt::paddle::pb +} // namespace pb +} // namespace paddle +} // namespace infrt diff --git a/paddle/infrt/paddle/pb/var_desc.cc b/paddle/infrt/paddle/pb/var_desc.cc index cf80df4f1b845..7ea2e24da3446 100644 --- a/paddle/infrt/paddle/pb/var_desc.cc +++ b/paddle/infrt/paddle/pb/var_desc.cc @@ -19,7 +19,9 @@ #include "paddle/infrt/paddle/cpp/desc_api.h" #include "paddle/infrt/paddle/framework.pb.h" -namespace infrt::paddle::pb { +namespace infrt { +namespace paddle { +namespace pb { cpp::VarDescAPI::Type VarDesc::GetType() const { auto type = desc_->type().type(); @@ -364,4 +366,6 @@ VarDesc::mutable_tensor_descs() { return std::vector(); } -} // namespace infrt::paddle::pb +} // namespace pb +} // namespace paddle +} // namespace infrt diff --git a/paddle/infrt/paddle/pb/var_desc.h b/paddle/infrt/paddle/pb/var_desc.h index 4cff5fdee0375..7215ba6bb6aa7 100644 --- a/paddle/infrt/paddle/pb/var_desc.h +++ b/paddle/infrt/paddle/pb/var_desc.h @@ -23,7 +23,9 @@ #include "paddle/infrt/paddle/cpp/desc_api.h" #include "paddle/infrt/paddle/framework.pb.h" -namespace infrt::paddle::pb { +namespace infrt { +namespace paddle { +namespace pb { namespace framework_proto = ::paddle::framework::proto; // convert between std::vector and protobuf repeated. @@ -121,4 +123,6 @@ class VarDesc : public cpp::VarDescAPI { framework_proto::VarDesc *desc_; }; -} // namespace infrt::paddle::pb +} // namespace pb +} // namespace paddle +} // namespace infrt From 87ee3e4f5438c567796e128b73eb7703aa56d2ec Mon Sep 17 00:00:00 2001 From: Zhangjingyu06 <92561254+Zhangjingyu06@users.noreply.github.com> Date: Fri, 14 Jan 2022 16:15:47 +0800 Subject: [PATCH 22/24] [XPU]add stack_grad op for kunlun2,*test=kunlun (#38674) * [XPU]add split op for kunlun2,*test=kunlun * [XPU]add split op for kunlun2,*test=kunlun * [XPU]add split op for kunlun,*test=kunlun * [XPU]add stack_grad op for kunlun2,*test=kunlun Co-authored-by: QingshuChen --- paddle/fluid/operators/stack_op_xpu.cc | 43 ++++++++++++++++--- .../fluid/platform/device/xpu/xpu1_op_list.h | 1 + .../fluid/platform/device/xpu/xpu2_op_list.h | 2 + .../tests/unittests/xpu/test_stack_op_xpu.py | 19 +++++++- 4 files changed, 58 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/operators/stack_op_xpu.cc b/paddle/fluid/operators/stack_op_xpu.cc index 01ec4a2b16b4a..a2590e1180c1a 100644 --- a/paddle/fluid/operators/stack_op_xpu.cc +++ b/paddle/fluid/operators/stack_op_xpu.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,9 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#ifdef PADDLE_WITH_XPU #include "paddle/fluid/operators/stack_op.h" #include -#ifdef PADDLE_WITH_XPU +#include +#include "paddle/fluid/operators/concat_op.h" +#include "paddle/fluid/platform/device/xpu/xpu_header.h" namespace paddle { namespace operators { @@ -59,14 +62,44 @@ class StackXPUKernel : public framework::OpKernel { } }; +template +class StackGradXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* dy = ctx.Input(framework::GradVarName("Y")); + auto dx = ctx.MultiOutput(framework::GradVarName("X")); + auto axis = ctx.Attr("axis"); + auto& dev_ctx = ctx.template device_context(); + auto dy_dims = dy->dims(); + + if (axis < 0) axis += dy_dims.size() + 1; + auto dy_shape = framework::vectorize(dy_dims); + + std::vector dx_dims_list(dx.size(), 1); + std::vector dx_lists; + for (auto out : dx) { + dx_lists.push_back(out->mutable_data(ctx.GetPlace())); + } + + int r = xpu::split(dev_ctx.x_context(), dy->data(), dx_lists, + dy_shape, dx_dims_list, axis); + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, + platform::errors::External( + "The stack_grad XPU kernel return wrong value[%d %s]", + r, XPUAPIErrorMsg[r])); + } +}; + } // namespace operators } // namespace paddle namespace plat = paddle::platform; namespace ops = paddle::operators; - REGISTER_OP_XPU_KERNEL(stack, - ops::StackXPUKernel, + ops::StackXPUKernel, ops::StackXPUKernel, - ops::StackXPUKernel); + ops::StackXPUKernel); +REGISTER_OP_XPU_KERNEL(stack_grad, + ops::StackGradXPUKernel, + ops::StackGradXPUKernel); #endif diff --git a/paddle/fluid/platform/device/xpu/xpu1_op_list.h b/paddle/fluid/platform/device/xpu/xpu1_op_list.h index 26a1426bea036..a76bdd4ae9679 100644 --- a/paddle/fluid/platform/device/xpu/xpu1_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu1_op_list.h @@ -300,6 +300,7 @@ XPUOpMap& get_kl1_ops() { pOpKernelType(vartype::UINT8, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, {"stack", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"stack_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"sum", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"tanh_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"tanh", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h index 79261a5d7bc88..3d140b4693a6f 100644 --- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu2_op_list.h @@ -333,6 +333,8 @@ XPUOpMap& get_kl2_ops() { {"stack", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace())})}, + {"stack_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace())})}, {"sum", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, {"tanh_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), diff --git a/python/paddle/fluid/tests/unittests/xpu/test_stack_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_stack_op_xpu.py index 68e5a6ccdbfb7..20446aee41ec7 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_stack_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_stack_op_xpu.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -66,6 +66,15 @@ def test_check_output(self): place = paddle.XPUPlace(0) self.check_output_with_place(place) + def test_check_grad(self): + if self.dtype == 'int64' or self.dtype == 'int32': + pass + else: + if paddle.is_compiled_with_xpu(): + paddle.enable_static() + place = paddle.XPUPlace(0) + self.check_grad_with_place(place, self.get_x_names(), 'Y') + class TestStackOp1(TestStackOpBase): def initParameters(self): @@ -81,11 +90,17 @@ class TestStackOp3(TestStackOpBase): def initParameters(self): self.axis = -1 + def test_check_grad(self): + pass + class TestStackOp4(TestStackOpBase): def initParameters(self): self.axis = -4 + def test_check_grad(self): + pass + class TestStackOp5(TestStackOpBase): def initParameters(self): @@ -113,7 +128,7 @@ def initDefaultParameters(self): self.num_inputs = 4 self.input_dim = (5, 6, 7) self.axis = 0 - self.dtype = 'int' + self.dtype = 'int32' def initParameters(self): self.num_inputs = 16 From 050aa6fe5a524b0e7b85201c54a0da315701518d Mon Sep 17 00:00:00 2001 From: heliqi Date: Fri, 14 Jan 2022 16:50:56 +0800 Subject: [PATCH 23/24] add flatten_contiguous_range OpConvert for Paddle-TRT (#38922) * add trt_convert_flatten_contiguous_rang op * trt version >7,support trt_convert_flatten_contiguous_rang * trt version >7,support trt_convert_flatten_contiguous_rang * trt version >7,support trt_convert_flatten_contiguous_rang * test cast add trt version >=7 skip --- .../ir_passes/tensorrt_subgraph_pass.cc | 7 +- .../fluid/inference/api/analysis_predictor.cc | 1 + .../inference/tensorrt/convert/CMakeLists.txt | 2 +- .../convert/flatten_contiguous_range_op.cc | 136 ++++++++++++++++++ paddle/fluid/inference/tensorrt/op_teller.cc | 32 +++++ ...st_trt_convert_flatten_contiguous_range.py | 115 +++++++++++++++ 6 files changed, 290 insertions(+), 3 deletions(-) create mode 100644 paddle/fluid/inference/tensorrt/convert/flatten_contiguous_range_op.cc create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_flatten_contiguous_range.py diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index ef50df3084f8c..55bbc55450876 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -46,8 +46,11 @@ void analysis::TensorRtSubgraphPass::ApplyImpl( << " is diabled by config in TensorRT"; return false; } - return tensorrt::OpTeller::Global().Tell(node, no_calib_int8, - with_dynamic_shape); + bool is_ok = tensorrt::OpTeller::Global().Tell(node, no_calib_int8, + with_dynamic_shape); + if (!is_ok) + VLOG(3) << node->Op()->Type().c_str() << " op is not in TensorRT"; + return is_ok; }; framework::ir::SubGraphFuser fuser( diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 2799fb9e174d3..d4b680288e347 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -1416,6 +1416,7 @@ USE_TRT_CONVERTER(elementwise_min_tensor); USE_TRT_CONVERTER(elementwise_pow_tensor); USE_TRT_CONVERTER(transpose); USE_TRT_CONVERTER(flatten); +USE_TRT_CONVERTER(flatten_contiguous_range); USE_TRT_CONVERTER(matmul); USE_TRT_CONVERTER(conv2d); USE_TRT_CONVERTER(relu); diff --git 
a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index a885b69fa7fbc..017caca6adc81 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -3,7 +3,7 @@ nv_library(tensorrt_converter SRCS matmul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc dropout_op.cc group_norm_op.cc pad_op.cc split_op.cc prelu_op.cc leaky_relu_op.cc gelu_op.cc layer_norm_op.cc multihead_matmul_op.cc - shuffle_channel_op.cc swish_op.cc instance_norm_op.cc stack_op.cc transpose_op.cc flatten_op.cc + shuffle_channel_op.cc swish_op.cc instance_norm_op.cc stack_op.cc transpose_op.cc flatten_op.cc flatten_contiguous_range_op.cc emb_eltwise_layernorm.cc skip_layernorm.cc scale_op.cc slice_op.cc hard_sigmoid_op.cc hard_swish_op.cc clip_op.cc gather_op.cc anchor_generator_op.cc diff --git a/paddle/fluid/inference/tensorrt/convert/flatten_contiguous_range_op.cc b/paddle/fluid/inference/tensorrt/convert/flatten_contiguous_range_op.cc new file mode 100644 index 0000000000000..706814340a0e9 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/flatten_contiguous_range_op.cc @@ -0,0 +1,136 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" + +namespace paddle { +namespace framework { +class Scope; +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace inference { +namespace tensorrt { +/* + * flatten_contiguous_range trt converter + */ +class FlattenContiguousRangeOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + framework::OpDesc op_desc(op, nullptr); + // Declare inputs + auto* input = engine_->GetITensor(op_desc.Input("X")[0]); + int dims = input->getDimensions().nbDims; + int start_axis = BOOST_GET_CONST(int, op_desc.GetAttr("start_axis")); + int stop_axis = BOOST_GET_CONST(int, op_desc.GetAttr("stop_axis")); + + nvinfer1::IShuffleLayer* layer = nullptr; + if (!engine_->with_dynamic_shape()) { + if (start_axis < 0) start_axis += dims + 1; + if (stop_axis < 0) stop_axis += dims + 1; + int dim_prod = 1; + nvinfer1::Dims flatten_dim; + flatten_dim.nbDims = dims - (stop_axis - start_axis); + for (int i = 0, j = 0; i < dims; ++i) { + if (start_axis <= i + 1 && i + 1 <= stop_axis) { + int dim_i = input->getDimensions().d[i]; + PADDLE_ENFORCE_GT(dim_i, 0, platform::errors::InvalidArgument( + "flatten_contiguous_range input dim " + "should be > 0, but got %d.", + dim_i)); + dim_prod *= dim_i; + if (i + 1 == stop_axis) { + flatten_dim.d[j++] = dim_prod; + } + } else { + flatten_dim.d[j++] = input->getDimensions().d[i]; + } + } + layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input); + layer->setReshapeDimensions(flatten_dim); + } else { + if (start_axis < 0) start_axis += dims; + if (stop_axis < 0) stop_axis += dims; + auto* shape_layer = TRT_ENGINE_ADD_LAYER(engine_, Shape, *input); + auto* shape_layer_itensor = shape_layer->getOutput(0); + + nvinfer1::Dims start_dim, size_dim, stride_dim; + start_dim.nbDims = 1; + size_dim.nbDims = 1; + stride_dim.nbDims = 1; + start_dim.d[0] = start_axis; + size_dim.d[0] = stop_axis - start_axis + 1; + stride_dim.d[0] = 1; + auto* slice_layer = + TRT_ENGINE_ADD_LAYER(engine_, Slice, *shape_layer_itensor, start_dim, + size_dim, stride_dim); + uint32_t reduce_dim = 1; + auto* reduce_prod_layer = TRT_ENGINE_ADD_LAYER( + engine_, Reduce, *(slice_layer->getOutput(0)), + nvinfer1::ReduceOperation::kPROD, reduce_dim, true); + + nvinfer1::ITensor* input_shape = nullptr; + if (start_axis == 0 && stop_axis == dims - 1) { + input_shape = reduce_prod_layer->getOutput(0); + } else { + std::vector itensors; + if (start_axis > 0) { + nvinfer1::Dims left_start_dim, left_size_dim, left_stride_dim; + left_start_dim.nbDims = 1; + left_size_dim.nbDims = 1; + left_stride_dim.nbDims = 1; + left_start_dim.d[0] = 0; + left_size_dim.d[0] = start_axis; + left_stride_dim.d[0] = 1; + auto* slice_layer_left = TRT_ENGINE_ADD_LAYER( + engine_, Slice, *shape_layer_itensor, left_start_dim, + left_size_dim, left_stride_dim); + itensors.push_back(slice_layer_left->getOutput(0)); + } + itensors.push_back(reduce_prod_layer->getOutput(0)); + if (stop_axis < dims - 1) { + nvinfer1::Dims right_start_dim, right_size_dim, right_stride_dim; + right_start_dim.nbDims = 1; + right_size_dim.nbDims = 1; + right_stride_dim.nbDims = 1; + right_start_dim.d[0] = stop_axis + 1; + right_size_dim.d[0] = dims - stop_axis - 1; + right_stride_dim.d[0] = 1; + auto* slice_layer_right = TRT_ENGINE_ADD_LAYER( + engine_, Slice, *shape_layer_itensor, right_start_dim, + right_size_dim, 
right_stride_dim); + itensors.push_back(slice_layer_right->getOutput(0)); + } + auto* concat_layer = TRT_ENGINE_ADD_LAYER( + engine_, Concatenation, itensors.data(), itensors.size()); + concat_layer->setAxis(0); + input_shape = concat_layer->getOutput(0); + } + layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input); + layer->setInput(1, *input_shape); + } + auto output_name = op_desc.Output("Out")[0]; + RreplenishLayerAndOutput(layer, "flatten_contiguous_range", {output_name}, + test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(flatten_contiguous_range, + FlattenContiguousRangeOpConverter); diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index ddee4e0d682b0..6663103d4ca37 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -55,6 +55,7 @@ struct SimpleOpTypeSetTeller : public Teller { // #endif #if IS_TRT_VERSION_GE(7000) teller_set.insert("tile"); + teller_set.insert("flatten_contiguous_range"); #endif #if CUDA_VERSION >= 10020 teller_set.insert("reshape"); @@ -531,6 +532,37 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, if (axis != 1) return false; } } + if (op_type == "flatten_contiguous_range") { + if (!with_dynamic_shape) { + int start_axis = BOOST_GET_CONST(int, desc.GetAttr("start_axis")); + int stop_axis = BOOST_GET_CONST(int, desc.GetAttr("stop_axis")); + auto x_var_name = desc.Input("X")[0]; + auto* block = desc.Block(); + if (block == nullptr) { + VLOG(3) << "The block desc is nullptr, we can't continue to analyze. " + "Developers need to check whether block_desc is passed in " + "the pass."; + return false; + } + auto* x_var_desc = block->FindVar(x_var_name); + const auto x_shape = x_var_desc->GetShape(); + int dims = x_shape.size(); + if (start_axis < 0) start_axis += dims; + if (start_axis == 0) { + VLOG(3) << "TRT flatten_contiguous_range not support the " + "batch-dimension being changed"; + return false; + } + if (stop_axis < 0) stop_axis += dims; + for (int i = start_axis; i <= stop_axis; ++i) { + if (x_shape[i] < 0) { + VLOG(3) << "On TRT static shape,flatten_contiguous_range input dim " + "should be > 0"; + return false; + } + } + } + } if (op_type == "gather") { auto gather_inputs = desc.Inputs(); diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_flatten_contiguous_range.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_flatten_contiguous_range.py new file mode 100644 index 0000000000000..a4060349d4bed --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_flatten_contiguous_range.py @@ -0,0 +1,115 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons +from program_config import TensorConfig, ProgramConfig +import unittest +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import Optional, List, Callable, Dict, Any, Set + + +class TrtConvertFlattenContiguousRangeTest(TrtLayerAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + return True + + def sample_program_configs(self): + def generate_input(batch): + return np.random.random([2, batch, 4, 8, 3]).astype(np.float32) + + for batch in [1, 2, 4]: + for start_axis in range(5): + for stop_axis in range(start_axis, 5): + type = "flatten_contiguous_range" + op_outputs = { + "Out": ["output_data"], + "XShape": ["xshape_data"] + } + ops_config = [{ + "op_type": type, + "op_inputs": { + "X": ["input_data"] + }, + "op_outputs": op_outputs, + "op_attrs": { + "start_axis": start_axis, + "stop_axis": stop_axis, + } + }] + ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "input_data": TensorConfig( + data_gen=partial(generate_input, batch)) + }, + outputs=["output_data"]) + yield program_config + + def sample_predictor_configs( + self, program_config) -> (paddle_infer.Config, List[int], float): + def generate_dynamic_shape(attrs): + self.dynamic_shape.min_input_shape = {"input_data": [2, 1, 4, 8, 3]} + self.dynamic_shape.max_input_shape = {"input_data": [2, 4, 4, 8, 3]} + self.dynamic_shape.opt_input_shape = {"input_data": [2, 2, 4, 8, 3]} + + def clear_dynamic_shape(): + self.dynamic_shape.max_input_shape = {} + self.dynamic_shape.min_input_shape = {} + self.dynamic_shape.opt_input_shape = {} + + def generate_trt_nodes_num(attrs, dynamic_shape): + ver = paddle_infer.get_trt_compile_version() + if ver[0] * 1000 + ver[1] * 100 + ver[0] * 10 >= 7000: + if dynamic_shape: + return 1, 2 + else: + if attrs[0]['start_axis'] == 0: + return 0, 3 + else: + return 1, 2 + else: + return 0, 3 + + attrs = [ + program_config.ops[i].attrs + for i in range(len(program_config.ops)) + ] + + # for static_shape + clear_dynamic_shape() + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), 1e-5 + + # for dynamic_shape + generate_dynamic_shape(attrs) + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num(attrs, + True), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num(attrs, + True), 1e-5 + + def test(self): + self.run_test() + + +if __name__ == "__main__": + unittest.main() From a88791481484ab6a61540a737336d79c65d021dc Mon Sep 17 00:00:00 2001 From: zyfncg Date: Sat, 15 Jan 2022 12:39:49 +0800 Subject: [PATCH 24/24] fix performance problem caused by Conj (#38939) --- paddle/pten/kernels/complex_kernel.h | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/paddle/pten/kernels/complex_kernel.h b/paddle/pten/kernels/complex_kernel.h index b6074f117ea14..d12fc730fef87 100644 --- a/paddle/pten/kernels/complex_kernel.h +++ b/paddle/pten/kernels/complex_kernel.h @@ -14,6 +14,7 @@ limitations under the License. 
 */

 #pragma once
+#include "paddle/fluid/platform/complex.h"
 #include "paddle/pten/core/dense_tensor.h"
 #include "paddle/pten/infermeta/unary.h"
 #include "paddle/pten/kernels/empty_kernel.h"
@@ -23,7 +24,13 @@ namespace pten {
 template <typename T, typename Context>
 void ConjKernel(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out);
 
-template <typename T, typename Context>
+// If T is complex
+template <
+    typename T,
+    typename Context,
+    std::enable_if_t<std::is_same<T, paddle::platform::complex<float>>::value ||
+                         std::is_same<T, paddle::platform::complex<double>>::value,
+                     bool> = true>
 DenseTensor Conj(const Context& dev_ctx, const DenseTensor& x) {
   auto out_meta = UnchangedInferMeta(x.meta());
   auto dense_out = pten::Empty<T, Context>(dev_ctx, std::move(out_meta));
@@ -31,4 +38,15 @@ DenseTensor Conj(const Context& dev_ctx, const DenseTensor& x) {
   return dense_out;
 }
 
+// If T is not complex
+template <
+    typename T,
+    typename Context,
+    std::enable_if_t<!std::is_same<T, paddle::platform::complex<float>>::value &&
+                         !std::is_same<T, paddle::platform::complex<double>>::value,
+                     bool> = true>
+DenseTensor Conj(const Context& dev_ctx, const DenseTensor& x) {
+  return x;
+}
+
 } // namespace pten
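
Note on the overload pair in the last hunk: Conj is split into a complex and a non-complex version with std::enable_if_t on a defaulted bool template parameter, so for real-valued element types the call collapses to handing back the input tensor and no output allocation or ConjKernel launch happens. The standalone sketch below illustrates the same dispatch pattern; the Tensor struct, the is_complex_v helper, the file name, and the reference-returning identity overload are simplifications invented for this example and are not part of the pten API.

// sketch_conj_dispatch.cc -- illustrative only, not the pten implementation.
#include <complex>
#include <iostream>
#include <type_traits>
#include <vector>

// Toy stand-in for a tensor: just a flat buffer of T.
template <typename T>
struct Tensor {
  std::vector<T> data;
};

// True only for the two complex element types considered here.
template <typename T>
constexpr bool is_complex_v = std::is_same<T, std::complex<float>>::value ||
                              std::is_same<T, std::complex<double>>::value;

// Complex case: allocate an output and conjugate every element.
template <typename T, std::enable_if_t<is_complex_v<T>, bool> = true>
Tensor<T> Conj(const Tensor<T>& x) {
  Tensor<T> out;
  out.data.reserve(x.data.size());
  for (const auto& v : x.data) out.data.push_back(std::conj(v));
  return out;
}

// Non-complex case: conjugation is the identity, so hand the input back
// without touching the data (the real patch returns the tensor itself for
// the same reason).
template <typename T, std::enable_if_t<!is_complex_v<T>, bool> = true>
const Tensor<T>& Conj(const Tensor<T>& x) {
  return x;
}

int main() {
  Tensor<float> real{{1.0f, 2.0f}};
  Tensor<std::complex<float>> cplx{{{1.0f, 2.0f}, {3.0f, -4.0f}}};

  const Tensor<float>& r = Conj(real);         // identity overload, no element-wise work
  Tensor<std::complex<float>> c = Conj(cplx);  // element-wise conjugate

  std::cout << r.data[1] << " " << c.data[1].imag() << "\n";  // prints: 2 4
  return 0;
}

Keeping both overloads in one header and letting SFINAE discard exactly one of them per element type avoids a runtime branch and keeps call sites identical for complex and non-complex tensors.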